Shang and Zhou pottery data exploration

April 16, 2025

Overview

This file presents the workflow of the analysis made so far for the project. Although it was made primarily for visualization, the main processing functions and code blocks are also included in the file, and can be inspected or tested by unfolding the buttons on the right.

Loadings

These are auxiliary functions for library loading and plot saving.
settings.r contains most of the static parameters, dictionaries and conditions.

wd <- getwd()           # Set the working directory manually if launching in R GUI
source("load_pckgs.r")  # Load the library loader function (defines load_pkg())
source("settings.r")    # Load the settings (paths, plot params, dictionaries)
source("plot_saver.r")  # Load the function saving the plots to png

Settings:

# File names & paths ----
    info_and_carb_data_file_name <- "Data.xlsx" # Excel workbook with the info/carbonisation data
    petro_data_file_name <- "petrography.xlsx" # Excel workbook with the petrography data
    gcms_data_file_name <- "Preliminary analysis Shang and Zhou.xlsx" # Excel workbook with the GC-MS data
    inpath <- file.path(wd, "input") # input folder under the working directory
    outpath <- file.path(wd, "output") # output folder under the working directory
    path <- "p:/2025-vessels/spectra/mz" # absolute path to the raw m/z spectra

# Plot settings ----
    parts <- c("lip", "neck", "shoulder", "body", "foot", "crotch") # vessel parts, top to bottom
    gap <- 250 # horizontal gap between the IN and OUT silhouettes
    cex <- 10 # base font size multiplier for plot text

    drywetColors <- c("WET" = "#1f77b4", "DRY" = "#ff7f0e") # pie-chart colors for the dry/wet groups
    na_col <- "#b8241a" # color marking a missing vessel part
    file_format <- "pdf" # preferred file format for saved plots

# Processing settings ----
    print_res <- TRUE
    excluded_vars <- c("c.conf", "g.conf") # variables dropped from the processing

# Dictionaries ----
    # Rule set for dry/wet assignment: rules are evaluated top-down,
    # the catch-all "TRUE" row at the bottom maps everything else to 0.
        dry_wet_conditions <- data.frame(
            query_string = c(
                "col_in_neck == 'carb' & col_in_shoulder == 'carb' & col_in_body == 'carb'", 
                "col_in_neck == 'clear' & col_in_shoulder == 'carb' & col_in_body == 'carb'", 
                "col_in_neck != 'water' & !is.na(col_in_neck) & col_in_shoulder != 'water' & !is.na(col_in_shoulder) & col_in_body == 'carb'", 
                "col_in_neck == 'water' | col_in_shoulder == 'water' | col_in_body == 'water'",
                "col_in_neck == 'carb' & !is.na(col_in_shoulder) & col_in_body == 'clear'",
                "col_in_neck == 'clear' & col_in_shoulder == 'carb' & col_in_body == 'clear'",
                "col_in_neck == 'clear' & col_in_shoulder == 'clear' & col_in_body == 'clear'",
                "col_in_foot == 'water'",
                "TRUE"
            ),
            # signed level: negative = DRY, positive = WET, 0 = unclassified
            value = c(-3, -2, -1, 3, 2, 2, 1, 1, 0),
            header = c(
                "DRY, level 3",
                "DRY, level 2",
                "DRY, level 1",
                "WET, level 3",
                "WET, level 2",
                "WET, level 2",
                "WET, level 1",
                "WET, level 1",
                "na"
            ),
            # human-readable restatement of each rule
            help = c(
                "from neck to body: CARB CARB CARB",
                "from neck to body: CLEAR CARB CARB",
                "from neck to body: CLEAR CLEAR CARB",
                "from neck to body: ANY is WATER",
                "from neck to body: CARB ANY CLEAR",
                "from neck to body: CLEAR CARB CLEAR",
                "from neck to body: CLEAR CLEAR CLEAR",
                "foot is water and body is absent",
                "the rest combinations"
            )
        )

# Variable descriptions ----
    # One entry per dataset column. Each entry records:
    #   type    - storage/semantic type ("num dbl", "txt fct", "txt ord", "lgl", ...)
    #   units   - measurement units, or NULL when unitless
    #   options - allowed values for categorical/logical variables, NULL otherwise
    #   descr   - human-readable description
    # Prefixes: i. = info, m. = measurements, c. = carbonisation,
    #           g. = GC-MS, p. = petrography.
    var_info <- list(
        "id"                      = list(type = "text id", units = NULL, options = NULL, descr = "Unique sample id"),
        "i.site"                  = list(type = "txt fct", units = NULL, options = c("Gaoqingcaopo", "Kanjiazhai", "Yangxinliwu"), descr = "The name of the site"),
        # FIX: "dynastie" -> "dynasty"
        "i.period"                = list(type = "txt ord", units = NULL, options = c("Shang", "Zhou"), descr = "The dynasty"),
        "i.lip"                   = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "TRUE if lip is present in a sample"),
        "i.neck"                  = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "TRUE if neck is present in a sample"),
        "i.shoulder"              = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "TRUE if shoulder is present in a sample"),
        "i.body"                  = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "TRUE if body is present in a sample"),
        "i.foot"                  = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "TRUE if foot is present in a sample"),
        "i.crotch"                = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "TRUE if crotch is present in a sample"),
        # FIX: unit was "%%", a percentage is "%"
        "i.completeness"          = list(type = "num dbl", units = "%", options = NULL, descr = "100 means all parts are present in a sample, less means a fragment is missing some parts"),
        "i.collection"            = list(type = "txt fct", units = NULL, options = NULL, descr = "A combination of period and a first letter of the site"),

        "m.body_width"            = list(type = "num dbl", units = "mm", options = NULL, descr = "Width of the body"),
        "m.height"                = list(type = "num dbl", units = "mm", options = NULL, descr = "Height of the vessel fragment"),
        "m.rim"                   = list(type = "num dbl", units = "mm", options = NULL, descr = "The size of the rim"),
        "m.orifice"               = list(type = "num dbl", units = "mm", options = NULL, descr = "The size of the orifice"),
        "m.cord_mark_width"       = list(type = "num dbl", units = "mm", options = NULL, descr = "The width of a cordmark"),
        # FIX: a categorical type has no "mm" unit
        "m.cord_mark_type"        = list(type = "txt fct", units = NULL, options = NULL, descr = "Cordmark type"),

        "c.col_in_lip"            = list(type = "txt fct", units = NULL, options = c("carb", "clear", "water"), descr = "A coloration of the lip part from inside"),
        "c.col_in_neck"           = list(type = "txt fct", units = NULL, options = c("carb", "clear", "water"), descr = "A coloration of the neck part from inside"),
        "c.col_in_shoulder"       = list(type = "txt fct", units = NULL, options = c("carb", "clear", "water"), descr = "A coloration of the shoulder part from inside"),
        "c.col_in_body"           = list(type = "txt fct", units = NULL, options = c("carb", "clear", "water"), descr = "A coloration of the body part from inside"),
        "c.col_out_lip"           = list(type = "txt fct", units = NULL, options = c("carb", "clear", "water"), descr = "A coloration of the lip part from outside"),
        "c.col_out_neck"          = list(type = "txt fct", units = NULL, options = c("carb", "clear", "water"), descr = "A coloration of the neck part from outside"),
        "c.col_out_shoulder"      = list(type = "txt fct", units = NULL, options = c("carb", "clear", "water"), descr = "A coloration of the shoulder part from outside"),
        "c.col_out_body"          = list(type = "txt fct", units = NULL, options = c("carb", "clear", "water"), descr = "A coloration of the body part from outside"),
        "c.col_in_foot"           = list(type = "txt fct", units = NULL, options = c("carb", "clear", "water"), descr = "A coloration of the foot part from inside"),
        "c.col_out_foot"          = list(type = "txt fct", units = NULL, options = c("carb", "clear", "water"), descr = "A coloration of the foot part from outside"),
        "c.col_out_crotch"        = list(type = "txt fct", units = NULL, options = c("carb", "clear", "water"), descr = "A coloration of the crotch part from outside"),
        "c.group"                 = list(type = "txt ord", units = NULL, options = c("dry", "wet"), descr = "A classification according to carbonisation analysis"),
        "c.conf"                  = list(type = "num dbl", units = "%", options = c(0.3, 0.5, 1), descr = "A confidence level for 'Dry or Wet' classification, 1 means dead certainty"),

        "g.lipids_conc"           = list(type = "num dbl", units = "%", options = NULL, descr = "A concentration of lipids in a sample, promille?"),
        "g.scfa"                  = list(type = "num dbl", units = "%", options = NULL, descr = "Relative abundance of short-chain fatty acids (C2:C6)"),
        "g.mcfa"                  = list(type = "num dbl", units = "%", options = NULL, descr = "Relative abundance of medium-chain fatty acids (C6:C12)"),
        "g.lcfa"                  = list(type = "num dbl", units = "%", options = NULL, descr = "Relative abundance of long-chain fatty acids (>C12)"),
        "g.uns_fa"                = list(type = "num dbl", units = "%", options = NULL, descr = "Relative abundance of unsaturated fatty acids"),
        "g.diacids"               = list(type = "num dbl", units = "%", options = NULL, descr = "Relative abundance of dicarboxylic acids: oxidized fatty acids with two carboxyl groups"),
        "g.alkanes"               = list(type = "num dbl", units = "%", options = NULL, descr = "Relative abundance of saturated hydrocarbons"),
        "g.p_s"                   = list(type = "num dbl", units = NULL, options = NULL, descr = "Pristane/Phytane ratio ??"),
        "g.o_s"                   = list(type = "num dbl", units = NULL, options = NULL, descr = "Odd-over-even carbon number predominance ??"),
        "g.aapac18e_h"            = list(type = "num dbl", units = NULL, options = NULL, descr = "An index related to C18 alkylphenanthrene homologs (E/H variants) ??"),
        "g.srr"                   = list(type = "num dbl", units = NULL, options = NULL, descr = "A sterane ratio expressed as a percentage ??"),
        "g.group"                 = list(type = "txt ord", units = NULL, options = c("plant", "animal", "mixture"), descr = "A classification for organics source based on expert analysis"),
        "g.conf"                  = list(type = "num dbl", units = NULL, options = NULL, descr = "A confidence level for organic source classification (g.group), 1 means dead certainty"),
        "g.plant_count"           = list(type = "num int", units = NULL, options = NULL, descr = "A rowwise count for 'plant'"),
        "g.tree_count"            = list(type = "num int", units = NULL, options = NULL, descr = "A rowwise count for 'tree'"),
        # FIX: "was mention" -> "was mentioned" (consistent with the p.* entries)
        "g.cereal"                = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'cereal' was mentioned in the comments"),
        "g.fruit"                 = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'fruit' was mentioned in the comments"),
        "g.vegetable"             = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'vegetable' was mentioned in the comments"),
        "g.resin"                 = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'resin' was mentioned in the comments"),
        "g.fish"                  = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'fish' was mentioned in the comments"),
        "g.complex_mxtr"          = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'complex mixture' was mentioned in the comments"),

        "p.optical_activity"      = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if optical activity is 'Active'"),
        "p.mica"                  = list(type = "txt ord", units = NULL, options = NULL, descr = "The relative abundance of mica minerals"),
        "p.group"                 = list(type = "txt fct", units = NULL, options = NULL, descr = "A classification for petrography based on expert analysis"),
        "p.lower_fr_bound"        = list(type = "num dbl", units = "mm", options = NULL, descr = "A lower bound for rock fragments approximate size range"),
        # FIX: upper bound measures the same quantity as the lower bound -> "mm"
        "p.upper_fr_bound"        = list(type = "num dbl", units = "mm", options = NULL, descr = "An upper bound for rock fragments approximate size range"),
        # FIX: "roundndess" -> "roundness"
        "p.lower_roundness_bound" = list(type = "txt ord", units = NULL, options = c("angular", "subangular", "subround", "round"), descr = "A lower bound for rock roundness"),
        "p.upper_roundness_bound" = list(type = "txt ord", units = NULL, options = c("angular", "subangular", "subround", "round"), descr = "An upper bound for rock roundness"),
        "p.granite"               = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'granite' was mentioned in the comments"),
        "p.granodiorite"          = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'granodiorite' was mentioned in the comments"),
        "p.diorite"               = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'diorite' was mentioned in the comments"),
        "p.sandstone"             = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'sandstone' was mentioned in the comments"),
        "p.mudstone"              = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'mudstone' was mentioned in the comments"),
        "p.limestone"             = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'limestone' was mentioned in the comments"),
        "p.gritstone"             = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'gritstone' was mentioned in the comments"),
        "p.volcanic"              = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'volcanic' was mentioned in the comments"),
        "p.andesite"              = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'andesite' was mentioned in the comments"),
        "p.microgranite"          = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'microgranite' was mentioned in the comments"),
        "p.ksp"                   = list(type = "lgl", units = NULL, options = c("TRUE", "FALSE"), descr = "True if 'ksp' was mentioned in the comments")
    )

# GC-MS spectra variable descriptions ----
# One entry per spectra metadata column: units (NULL when unitless) and a
# human-readable description.
gcms_spectra_var_info <- list(
    "seqNum"            = list(units = NULL, descr = "Unique sequence number (ID)"),
    "acquisitionNum"    = list(units = NULL, descr = "Unique sequence number (ID)"),
    "msLevel"           = list(units = NULL, descr = "The level of mass spectrometry data: level 1 meaning only one stage of mass spectrometry was performed (no MS/MS fragmentation)"),
    "polarity"          = list(units = NULL, descr = "The value -1 indicates the scans were recorded in negative ion mode"),
    "peaksCount"        = list(units = NULL, descr = "How many peaks (detected ions) were recorded in each scan"),
    "totIonCurrent"     = list(units = NULL, descr = "The total ion current, representing the overall signal strength"),
    "retentionTime"     = list(units = "s", descr = "The time taken for a solute to pass through a chromatography column"),
    "basePeakMZ"        = list(units = NULL, descr = "The mass-to-charge (m/z) ratio of the most intense peak"),
    "basePeakIntensity" = list(units = NULL, descr = "The intensity of the most intense peak"),
    "centroided"        = list(units = NULL, descr = "TRUE indicates that the data points have already been processed to represent peak centers")
)

Library loading function:

load_pkg <- function(pkg) {
    # Load a single package by name, installing it first if it is missing.
    #
    # Args:
    #   pkg: a single character string, the package name.
    # Returns: the package name, invisibly (so lapply() output stays quiet).
    # Errors: if `pkg` is not a single string, or installation/loading fails.

    # Input validation
        if (!is.character(pkg) || length(pkg) != 1) {
            stop("Parameter 'pkg' must be a single character string.", call. = FALSE)
        }

    # Check if the package is installed; if not, attempt installation.
    # FIX: install.packages() signals a *warning*, not an error, when a
    # package cannot be installed, so the old tryCatch never caught a failed
    # install. Re-check availability after installing and fail loudly.
        if (!requireNamespace(pkg, quietly = TRUE)) {
            message("Installing ", pkg)
            tryCatch({
                install.packages(pkg, dependencies = TRUE, repos = "https://cloud.r-project.org", quiet = TRUE)
            }, error = function(e) {
                stop(sprintf("Installation failed for package '%s': %s", pkg, e$message), call. = FALSE)
            })
            if (!requireNamespace(pkg, quietly = TRUE)) {
                stop(sprintf("Installation failed for package '%s'.", pkg), call. = FALSE)
            }
        }

    # Attach the package unless it is already on the search path.
        if (!paste0("package:", pkg) %in% search()) {
            tryCatch({
                suppressPackageStartupMessages(library(pkg, character.only = TRUE))
            }, error = function(e) {
                stop(sprintf("Loading failed for package '%s': %s", pkg, e$message), call. = FALSE)
            })
        }

    invisible(pkg)
}

Plot saving function:

plot_saver <- function(object, path, name, width, height, dpi) {
  # Save a ggplot object to <path>/<name> and return an HTML snippet that
  # embeds the saved image as a clickable, full-width lightbox link.
  #
  # Args:
  #   object - the ggplot object to save
  #   path   - output directory; name - output file name (extension decides format)
  #   width, height - plot size (ggsave default units); dpi - resolution
  # Returns: knitr asis output holding the <a><img></a> HTML.
  source("load_pckgs.r")  # defines load_pkg()
  dependencies <- c("ggplot2", "knitr")
  # FIX: dependencies were declared but never actually loaded
  invisible(lapply(dependencies, load_pkg))

  # Create the target directory if missing so ggsave() does not fail
  if (!dir.exists(path)) {
    dir.create(path, recursive = TRUE)
  }
  full_path <- file.path(path, name)

  # Save the plot using ggsave
  ggsave(
    filename = full_path,
    plot = object,
    width = width,
    height = height,
    dpi = dpi
  )

  # Generate HTML code for clickable full-width image
  # NOTE(review): full_path is machine-local; a relative path may travel
  # better when the rendered document is published — confirm with the setup.
  html_code <- paste0(
    '<a href="', full_path, 
    '" data-lightbox="gallery">',
    '<img src="', full_path, 
    '" class="full-width" alt="Plot image" />',
    '</a>'
  )
  
  knitr::asis_output(html_code)
}

Analysis

Carbonization Pattern Plotter

The function loads the raw data from info_and_carb_data_file_name (parameter set in settings.r) and presents mean carbonisation patterns for each vessel part in one of several modes: "by_site_and_period", "by_site", or "by_period".

Carbonization Pattern Plotter Function:

carb_plotter <- function(mode = "by_site_and_period", gap = 250) {
    # Build mean carbonisation-pattern plots per vessel part.
    #
    # Args:
    #   mode: one of "by_site_and_period", "by_site", "by_period" — the
    #         grouping used to aggregate the carbonisation observations.
    #   gap:  horizontal gap (plot units) between the IN and OUT silhouettes.
    # Returns: list(data = NULL, plot = <grob>) with the assembled figure.
    # Side effects: sources helper scripts, reads the Excel data file, and
    # relies on globals from settings.r (inpath, info_and_carb_data_file_name,
    # parts, cex).

    modes <- c("by_site_and_period", "by_site", "by_period")

    # loadings
        source("plot_carb_pattern.r")
        source("settings.r")
        source("load_pckgs.r")
        source("plot_saver.r")
        dependencies <- c("readxl", "dplyr", "tidyr", "ggplot2", "grid", "gridExtra", "purrr", "sysfonts", "showtext", "knitr")
        invisible(lapply(dependencies, load_pkg))
        sysfonts::font_add_google("Open Sans", "opensans")
        showtext::showtext_auto()
        # FIX: windowsFonts()/windowsFont() exist only on Windows; guard so
        # the function still runs on other platforms.
        if (.Platform$OS.type == "windows") {
            grDevices::windowsFonts(opensans = grDevices::windowsFont("Open Sans"))
        }

        data <- readxl::read_excel(file.path(inpath, info_and_carb_data_file_name), na = "")

    # filtering data: one long row per (sherd, side, part); keep only the
    # informative carb/clear states
        carb <- data %>%
            dplyr::select(
                record_id,
                site_name, period,
                dplyr::starts_with("col_")
            ) %>%
            tidyr::pivot_longer(
                cols = dplyr::starts_with("col_"),
                names_to = "part",
                values_to = "state"
            ) %>%
            dplyr::mutate(
                side = ifelse(grepl("col_in_", part), "IN", "OUT"),
                part = gsub("col_(in|out)_", "", part)
            ) %>%
            dplyr::filter(
                state %in% c("carb", "clear")
            )

    # drawing a pot perimeter (digitized outline coordinates)
        x <- c(0, 373, 375, 368, 344,
            322, 316, 342, 368, 384,
            397, 399, 395, 384, 366,
            346, 289, 238, 179, 128,
            62, 0)

        y <- -c(0, 0, 17, 35, 53,
            77, 99, 152, 234, 357,
            514, 630, 785, 856, 873, 
            867, 818, 765, 715, 679,
            657, 648)

        perimeter_IN <- data.frame( x = x, y = y )
        perimeter_OUT <- data.frame( x = -x-gap*2, y = y ) # mirrored and shifted left

    # setting coloration zones (one closed polygon per vessel part)
        lip <- rbind( perimeter_IN[1:4,], c(0, perimeter_IN[4,2]) )
        neck <- rbind( perimeter_IN[4:7,], c(0, perimeter_IN[7,2]), c(0, perimeter_IN[4,2]) )
        shoulder <- rbind( perimeter_IN[7:9,], c(0, perimeter_IN[9,2]), c(0, perimeter_IN[7,2]) )
        body <- rbind( perimeter_IN[9:12,], c(0, perimeter_IN[12,2]), c(0, perimeter_IN[9,2]) )
        foot <- rbind( perimeter_IN[12:19,], c(perimeter_IN[19,1], perimeter_IN[12,2]) )
        crotch <- rbind( perimeter_IN[19:22,], c(perimeter_IN[22,1], perimeter_IN[12,2]), c(perimeter_IN[19,1], perimeter_IN[12,2]) )

        polygons_OUT <- list(
            lip = lip,
            neck = neck,
            shoulder = shoulder,
            body = body,
            foot = foot,
            crotch = crotch
            )

        # Mirror a polygon across x = 0 and shift it left by 2*gap
        mirror <- function(df) {
            df$x <- df$x *(-1)-gap*2
            return(df)
        }

        polygons_IN <- lapply(polygons_OUT, mirror)
        polygons <- list(inside = polygons_IN, outside = polygons_OUT)

    # text labels: six part names plus the INSIDE/OUTSIDE headers
        labels <- data.frame(
            x = c(
                rep(-gap, 6),
                mean( c(-x[15], -x[22]) - gap*2 ),
                mean( c(x[15], x[22]) )
                ),
            y = c(
                mean(c(y[2], y[4])),
                mean(c(y[4], y[7])),
                mean(c(y[7], y[9])),
                mean(c(y[9], y[12])),
                mean(c(y[15], y[19])),
                y[22],
                rep(y[1]+0.1*abs(abs(y[1])-abs(y[15])), 2)
                ),
            label = c( parts, "INSIDE", "OUTSIDE" )
        )

    # run carb plotter: build one plot per group; failures become NULL
        if (!mode %in% modes) {
            stop(sprintf("%s is not an option. Available cases:\n%s\n", mode, paste(modes, collapse = ", ")))
        } else {
            if (mode == "by_site_and_period") {
                site_by_period <- carb %>% dplyr::select(period, site_name) %>% dplyr::distinct()
                plots <- site_by_period %>%
                    purrr::pmap(function(period, site_name) {
                        tryCatch({
                            plot_carb_pattern(
                                df = carb,
                                poly = polygons,
                                site = site_name,
                                period = period,
                                x = x,
                                y = y,
                                perimeter_IN = perimeter_IN,
                                perimeter_OUT = perimeter_OUT,
                                labels = labels,
                                cex = cex)
                        }, error = function(e) {
                            message(sprintf("Error for site '%s' and period '%s': %s", site_name, period, e$message))
                            return(NULL)
                        })

                    })
            } else if (mode == "by_site") {
                sites <- carb %>% dplyr::select(site_name) %>% dplyr::distinct()
                plots <- sites %>%
                    purrr::pmap(function( site_name ) {
                        tryCatch({
                            plot_carb_pattern(
                                df = carb,
                                poly = polygons,
                                site = site_name,
                                period = NULL, # FIX: `period` was undefined in this scope; NULL means "any period"
                                x = x,
                                y = y,
                                perimeter_IN = perimeter_IN,
                                perimeter_OUT = perimeter_OUT,
                                labels = labels,
                                cex = cex)
                        }, error = function(e) {
                            message(sprintf("Error for site '%s': %s", site_name, e$message))
                            return(NULL)
                        })
                    })
            } else {
                periods <- carb %>% dplyr::select(period) %>% dplyr::distinct()
                plots <- periods %>%
                    purrr::pmap(function( period ) {
                        tryCatch({
                            plot_carb_pattern(
                                df = carb,
                                poly = polygons,
                                site = NULL, # FIX: `site_name` was undefined in this scope; NULL means "all sites"
                                period = period,
                                x = x,
                                y = y,
                                perimeter_IN = perimeter_IN,
                                perimeter_OUT = perimeter_OUT,
                                labels = labels,
                                cex = cex)
                        }, error = function(e) {
                            message(sprintf("Error for period '%s': %s", period, e$message))
                            return(NULL)
                        })
                    })
            }
        }

    # Cook the plots
        # FIX: failed plots come back as NULL from the error handlers above;
        # drop them so arrangeGrob() does not receive NULL grobs.
        plots <- Filter(Negate(is.null), plots)
        g <- gridExtra::arrangeGrob(grobs = plots, ncol = 2, nrow = 2)
        label_grob <- grid::textGrob(
            "% of carbonized (total clear and carb). Darker color means larger percentage of carbonized sherds;",
            x = 0.65, y = 0.02,
            just = c("right", "bottom"),
            gp = grid::gpar(fontsize = 12)
        )
        na_grob <- grid::textGrob(
            " cross-lining means NA ",
            x = 0.65, y = 0.02,
            just = c("left", "bottom"),
            gp = grid::gpar(fontsize = 12, col = "red")
        )

        final_grob <- gridExtra::arrangeGrob(
            g, label_grob, na_grob,
            ncol = 1,
            heights = grid::unit(c(0.9, 0.05, 0.05), "npc")
        )

        results <- list(data = NULL, plot = final_grob)
        return(results)
}

Carbonization Pattern Plotter Auxiliary Function:

plot_carb_pattern <- function(df, poly, site = NULL, period = NULL,
                                x, y, perimeter_IN, perimeter_OUT, labels,
                                cex = 10, gap = 250, na_col = "#b8241a") {
    # Draws one carbonisation-pattern figure (inside + outside silhouettes).
    #
    # df            long-format carb data with columns site_name, period, side, part, state
    # poly          list(inside = ..., outside = ...) of per-part polygons
    # site, period  filters; NULL means "aggregate over that dimension"
    #               NOTE(review): if BOTH are NULL no branch below creates
    #               `counts`, so the function errors at the select() further
    #               down — consider an explicit stop() for that case
    # x, y          pot perimeter coordinates (same vectors used by the caller)
    # perimeter_IN / perimeter_OUT  perimeter data.frames for the two silhouettes
    # labels        data.frame of part names and INSIDE/OUTSIDE headers
    # cex           font size multiplier; gap: IN/OUT spacing; na_col: NA marker color
    # Returns: a ggplot object (empty ggplot when the site/period filter matches no rows).
    # loadings
        period_colors <- list( "Shang" = "blue", "Zhou" = "aquamarine" ) # color settings of periods
        source("col_by_sides.r")
        source("load_pckgs.r")
        dependencies <- c("dplyr", "tidyr", "ggplot2", "ggpattern")
        invisible(lapply(dependencies, load_pkg))

    if (!is.null(site) && !is.null(period)) {
        # NOTE(review): {{ }} here vs !! below — both resolve the outer
        # variable inside the filter; unify on one style eventually
        if (nrow(df %>% dplyr::filter(site_name == {{site}} & period == {{period}})) == 0) {
            plot <- ggplot2::ggplot()
            return(plot)
            stop("No site, no period") # NOTE(review): unreachable — the return() above exits first
        } else {
            # Per-part counts of carb vs clear; hex = grayscale by % carbonized
            # (white = 0% carb, black = 100% carb)
            counts <- df %>%
                dplyr::filter(site_name == !!site & period == !!period) %>%
                dplyr::group_by(site_name, period, side, part, state) %>%
                dplyr::summarise(count = n(), .groups = "drop") %>%
                tidyr::pivot_wider(names_from = state, values_from = count, values_fill = list(count = 0)) %>%
                dplyr::mutate(
                    total = carb + clear,
                    prop_carb = round(100*carb / total, 2),
                    prop_clear = round(100*clear / total, 2),
                    color = round(255 * (1 - prop_carb / 100), 0),
                    hex = sprintf("#%02x%02x%02x", color, color, color),
                    part = factor(part, levels = parts),
                    period = factor(period, levels = c("Shang", "Zhou"))  
                ) %>%
                dplyr::arrange(period, part)

            # Title with the period name colored by period_colors (rendered
            # by ggtext::element_markdown in the theme below)
            title_text <- sprintf(
                "%s, <span style='color:%s;'>%s</span> period",
                site,
                period_colors[[period]],
                period
            )
        }
    } else if (!is.null(site) && is.null(period)) {
        # Same aggregation, but pooled over all periods for one site
        counts <- df %>%
            dplyr::select( -period ) %>%
            dplyr::filter(site_name == !!site) %>%
            dplyr::group_by(site_name, side, part, state) %>%
            dplyr::summarise(count = n(), .groups = "drop") %>%
            tidyr::pivot_wider(
                names_from = state,
                values_from = count,
                values_fill = list(count = 0)
            ) %>%
            dplyr::mutate(
                total = carb + clear,
                prop_carb = round(100*carb / total, 2),
                prop_clear = round(100*clear / total, 2),
                color = round(255 * (1 - prop_carb / 100), 0),
                hex = sprintf("#%02x%02x%02x", color, color, color),
                part = factor(part, levels = parts)  
            ) %>%
            dplyr::arrange(site_name, part)

        title_text <- sprintf("%s, any period", site)
    } else if (is.null(site) && !is.null(period)) {
        # Same aggregation, but pooled over all sites for one period
        counts <- df %>%
            dplyr::select( -site_name ) %>%
            dplyr::filter( period == !!period ) %>%
            dplyr::group_by( period, side, part, state ) %>%
            dplyr::summarise( count = n(), .groups = "drop" ) %>%
            tidyr::pivot_wider(names_from = state, values_from = count, values_fill = list(count = 0)) %>%
            dplyr::mutate(
                total = carb + clear,
                prop_carb = round(100*carb / total, 2),
                prop_clear = round(100*clear / total, 2),
                color = round(255 * (1 - prop_carb / 100), 0),
                hex = sprintf("#%02x%02x%02x", color, color, color),
                part = factor(part, levels = parts),
                period = factor(period, levels = c("Shang", "Zhou"))  
            ) %>%
            dplyr::arrange(period, part)

        title_text <- sprintf(
            "All sites, <span style='color:%s;'>%s</span> period",
            period_colors[[period]],
            period
        )
    }

    # Keep only what the drawing code needs; text = "% carb (n total)" labels
    counts <- counts %>%
        dplyr::select(side, part, hex, carb, clear) %>%
        dplyr::mutate(text = paste0(round(100*carb/(carb+clear)),"% (", carb + clear, ")")) %>%
        dplyr::arrange(side, part)

    # Per-side fill colors and label texts (col_by_sides from col_by_sides.r —
    # presumably it also substitutes na_col for missing parts; confirm there)
    color_in <- col_by_sides(counts, "IN")
    color_out <- col_by_sides(counts, "OUT")
    colors <- list(inside = color_in, outside = color_out)

    # Coordinates for the twelve "% (n)" labels: first six inside, then outside
    numbers <- data.frame(
        number = c(colors$inside$text, colors$outside$text),
        x = c(
            -rep(( 1.9*gap ), 4),
            -x[17]-1.9*gap,
            -1.9*gap,
            -rep(( 0.1*gap ), 4),
            x[17]-0.2*gap,
            -0.1*gap
            ),
        y = rep(c(
            mean(c(y[2], y[4])),
            mean(c(y[4], y[7])),
            mean(c(y[7], y[9])),
            mean(c(y[9], y[12])),
            y[17],
            y[22]
            ), 2
        )       
    )

    p <- ggplot2::ggplot()

    # Fill the inside polygons; parts whose color equals na_col are drawn as a
    # transparent cross-hatch (the "NA" marker) instead of a solid fill
    for (i in seq_along(poly$inside)) {
        poly_data <- poly$inside[[i]]
        poly_color <- colors$inside$hex[i]

        if (poly_color == na_col) {
            p <- p + ggpattern::geom_polygon_pattern(
                data = poly_data, ggplot2::aes(x = x, y = y, pattern = "diagonal"),
                pattern_fill = "transparent",
                pattern_density = 1,
                pattern_angle = 45,
                fill = "transparent",
                color = "black",
                show.legend = FALSE
            )

        } else {
            p <- p + ggplot2::geom_polygon(
                data = poly_data, ggplot2::aes(x = x, y = y),
                fill = poly_color,
                color = "black"
            )
        }
    }

    # Same drawing logic for the outside polygons
    for (i in seq_along(poly$outside)) {
        poly_data <- poly$outside[[i]]
        poly_color <- colors$outside$hex[i]

        if (poly_color == na_col) {
            p <- p + ggpattern::geom_polygon_pattern(
                data = poly_data,
                ggplot2::aes(x = x, y = y, pattern = "diagonal"),
                pattern_fill = "transparent",
                pattern_density = 1,
                pattern_angle = 45,
                fill = "transparent",
                color = "black",
                show.legend = FALSE
            )

        } else {
            p <- p + ggplot2::geom_polygon(
                data = poly_data, ggplot2::aes(x = x, y = y),
                fill = poly_color,
                color = "black"
            )
        }
    }

    # Assemble the final plot: perimeters, dashed center lines, labels, theme
    plot <-p +
        ggplot2::coord_fixed(ratio = 1) + 
        ggplot2::geom_path(
            data = perimeter_OUT,
            ggplot2::aes(x = x, y = y),
            linewidth = 1
        ) +
        ggplot2::geom_path(
            data = perimeter_IN,
            ggplot2::aes(x = x, y = y),
            linewidth = 1
            ) +
        ggplot2::labs(x = NULL, y = NULL, title = title_text) +
        # Dashed vertical axis of the inside silhouette
        ggplot2::geom_segment(
            ggplot2::aes(
                x = 0,
                y = y[1]+0.1*abs(abs(y[1])-abs(y[22])),
                xend = 0,
                yend = y[22]-0.1*abs(abs(y[1])-abs(y[22]))
                ),
            linetype = "dashed",
            linewidth = 1.5,
            lineend = "round"
            ) +
        # Dashed vertical axis of the outside silhouette (shifted by 2*gap)
        ggplot2::geom_segment(
            ggplot2::aes(
                x = -gap*2,
                y = y[1]+0.1*abs(abs(y[1])-abs(y[22])),
                xend = -gap*2,
                yend = y[22]-0.1*abs(abs(y[1])-abs(y[22]))
                ),
            linetype = "dashed",
            linewidth = 1.5,
            lineend = "round"
            ) +
        # INSIDE/OUTSIDE header labels (rows 7-8 of `labels`)
        ggplot2::geom_text(
            data = labels[7:8,],
            ggplot2::aes(x = x, y = y, label = label),
            size = 0.5*cex
            ) +
        # "% (n)" labels for the inside silhouette
        ggplot2:: geom_text(
            data = numbers[1:6,],
            ggplot2::aes(x = x, y = y, label = number),
            size=0.4*cex,
            hjust = 0,
            nudge_x = -0.2
            ) +
        # "% (n)" labels for the outside silhouette
        ggplot2::geom_text(
            data = numbers[7:12,],
            ggplot2::aes(x = x, y = y, label = number),
            size=0.4*cex,
            hjust = 1,
            nudge_x = 0.2
            ) +
        ggplot2::xlim(-900, 400) +
        hrbrthemes::theme_ipsum(base_family = "opensans") +
        ggplot2::theme(
            plot.title = ggtext::element_markdown(size = 2*cex, face = "bold", hjust = 0.5),
            panel.grid.major = ggplot2::element_blank(),
            panel.grid.minor = ggplot2::element_blank(),
            axis.text.x = ggplot2::element_blank(),
            axis.text.y = ggplot2::element_blank()
            )

    return(plot)
}
source("carbPlotter.r")  # Load the carbonization-pattern plotting function
# Build the carbonization-pattern figure grouped by site and period;
# gap (from settings.r) controls the horizontal spacing used when plotting
carb <- carb_plotter(mode = "by_site_and_period", gap = 250)
# Save as PNG: 13.3 x 10.7 inches at 300 dpi (see plot_saver.r for argument order)
plot_saver(carb$plot, outpath, "carb_pattern.png", 13.3, 10.7, 300)

Plot image

Dry or Wet Summaries

The function loads the raw data from info_and_carb_data_file_name (parameter set in settings.r) and performs calculations for dry/wet classification based on the conditions defined in dry_wet_conditions (set in settings.r).

Dry or Wet Function:

# Classify each sherd as DRY- or WET-cooked from its carbonization coloration
# pattern (rules in dry_wet_conditions, settings.r), summarise the results per
# group, and assemble a combined figure: rule table + summary table on the
# left, pie charts on the right.
#
# Args:
#   mode: "by_site_and_period", "by_site" or "by_period" -- summary grouping.
# Returns:
#   list(data = per-record table with a `state` column, plot = patchwork figure).
# Side effects: sources helper scripts and loads packages via load_pkg().
dryOrWet <- function(mode) {
    # loadings & settings
        source("buildCondition.r")
        source("load_pckgs.r")
        source("settings.r")
        source("plot_saver.r")
        dependencies <- c("readxl", "dplyr", "tidyr", "ggplot2", "rlang", "hrbrthemes", "ggtext", "ggpubr", "colorspace", "patchwork", "knitr")
        invisible(lapply(dependencies, load_pkg))
        # NOTE(review): windowsFonts() is Windows-only; guard on .Platform$OS.type
        # if this ever needs to run on another OS.
        grDevices::windowsFonts(opensans = grDevices::windowsFont("Open Sans"))

        df <- readxl::read_excel(file.path(inpath, info_and_carb_data_file_name), na = "")

    group_vars <- switch(
        mode,
        "by_site_and_period" = c("site_name", "period"),
        "by_site" = "site_name",
        "by_period" = "period"
    )

    # Build the case_when() call text from the rule table (settings.r)
    # case_when_string <- paste( "case_when(", buildCondition(dry_wet_conditions), ")" ) # with feet
    case_when_string <- paste( "case_when(", buildCondition(dry_wet_conditions[-8,]), ")" ) # without feet (rule row 8 excluded)

    dry_or_wet <- df %>%
        dplyr::group_by(dplyr::across(dplyr::all_of(group_vars))) %>%
        dplyr::mutate(
            # state: sign encodes dry (-) / wet (+), |state| in 1..3 encodes
            # confidence, 0 = unidentifiable from the available parts
            state = eval(rlang::parse_expr(case_when_string))
        )

    summary <- dry_or_wet %>%
        dplyr::summarize(
            .groups = "drop",
            # Confidence-weighted counts: weight 1 for |state| == 3,
            # 0.75 for |state| == 2, 0.5 for |state| == 1.
            DRY = round(sum(state == -3) + 0.75*sum(state == -2) + 0.5*sum(state == -1), 0),
            # Fixed: the WET mid-confidence term was `0.75*2*sum(state == 2)`
            # (weight 1.5), asymmetric with DRY; it now mirrors the DRY weights.
            WET = round(sum(state == 3) + 0.75*sum(state == 2) + 0.5*sum(state == 1), 0),
            na = sum(state == 0),
            total = sum(!is.na(state)),
            ident_rate = paste(round(100*(sum(!is.na(state)) - sum(state == 0))/sum(!is.na(state)), 0), "%"),
            dry3 = sum(state == -3),
            dry2 = sum(state == -2),
            dry1 = sum(state == -1),
            wet3 = sum(state == 3),
            wet2 = sum(state == 2),
            wet1 = sum(state == 1)
        )

    # Long format with DRY/WET percentages per group, for the pie charts
    plot_data <- summary %>%
        tidyr::pivot_longer(
            cols = c(DRY, WET),
            names_to = "condition",
            values_to = "value"
        ) %>%
        dplyr::group_by(dplyr::across(dplyr::all_of(group_vars)), condition) %>%
        dplyr::summarise(total = sum(value), .groups = 'drop') %>%
        dplyr::group_by(dplyr::across(dplyr::all_of(group_vars))) %>%
        dplyr::mutate(percentage = total / sum(total) * 100)

    if (mode != "by_site") {
        # Fix facet order chronologically (Shang before Zhou)
        plot_data <- plot_data %>%
            dplyr::mutate(period = factor(period, levels = c("Shang", "Zhou")))
    }

    facet_formula <- paste( "~", paste(group_vars, collapse = " + " ))

    piecharts <-
        ggplot2::ggplot(
            plot_data,
            ggplot2::aes(y = percentage, fill = condition)
        ) +
        ggplot2::geom_bar(
            ggplot2::aes(x = factor(1)),
            stat = "identity",
            width = 1,
            colour = "white",
            linewidth = 1
        ) +
        ggplot2::geom_text(
            ggplot2::aes(x = factor(1), label = paste0(round(percentage), "%")),
            position = ggplot2::position_stack(vjust = 0.5),
            size = 4,
            color = "white"
        ) +
        ggplot2::coord_polar("y", start = 0) + # stacked bar -> pie chart
        ggplot2::facet_wrap(as.formula(facet_formula), ncol = 2) +
        ggplot2::labs(
            x = NULL,
            y = NULL,
            fill = "Condition",
            title = sprintf("DRY versus WET by %s", paste(group_vars, collapse = " and "))
        ) +
        ggplot2::scale_fill_manual(values = drywetColors) +
        hrbrthemes::theme_ipsum(base_family = "opensans") +
        ggplot2::theme(
            panel.grid.major = ggplot2::element_blank(),
            panel.grid.minor = ggplot2::element_blank(),
            axis.text.x = ggplot2::element_blank(),
            axis.text.y = ggplot2::element_blank(),
            axis.title.x = ggplot2::element_blank(),
            axis.title.y = ggplot2::element_blank(),
            strip.text = ggplot2::element_text(size = 15, face = "bold", hjust = 0.5),
            legend.position = "bottom",
            panel.spacing = ggplot2::unit(2, "lines"),
            plot.title = ggtext::element_markdown(size = 20, face = "bold", hjust = 0.5)
        )

    # Fisher's exact test on the DRY/WET contingency table across groups.
    # NOTE(review): pvalue is computed but never used in the returned results --
    # either add it to the output list or drop this block.
    stat_data <- summary %>%
        dplyr::mutate(
            group = do.call(paste, c(dplyr::select(., dplyr::all_of(group_vars)), sep = ", "))
        ) %>%
        dplyr::select(group, DRY, WET)

    data_matrix <- as.matrix(stat_data[, c("DRY", "WET")])
    rownames(data_matrix) <- stat_data$group
    ftest <- stats::fisher.test(data_matrix, alternative = "two.sided")
    pvalue <- format(ftest$p.value, scientific = FALSE, digits = 3)

    conditions <- dry_wet_conditions %>%
        dplyr::select(header, help, value)

    dryCOLindex <- which(names(summary) == "DRY")
    wetCOLindex <- which(names(summary) == "WET")

    # Summary table with DRY/WET columns tinted in the legend colors
    table_sum <-
        ggpubr::ggtexttable(
            summary,
            rows = NULL,
            theme = ggpubr::ttheme(
                "light",
                padding = ggplot2::unit(c(1.5, 6), "mm")
                )
        ) %>%
        ggpubr::table_cell_bg(
            row = 2:(nrow(summary) + 1),
            column = wetCOLindex,
            fill = colorspace::lighten(unname(drywetColors["WET"]), amount = 0.3),
            color = "white"
        ) %>%
        ggpubr::table_cell_bg(
            row = 2:(nrow(summary) + 1),
            column = dryCOLindex,
            fill = colorspace::lighten(unname(drywetColors["DRY"]), amount = 0.2),
            color = "white"
        )

    # NOTE(review): the row ranges 2:4 / 5:9 below are hard-coded to the current
    # dry_wet_conditions layout; update them if the rule table changes.
    table_cond <-
        ggpubr::ggtexttable(
            conditions,
            rows = NULL,
            theme = ggpubr::ttheme(
                "light",
                padding = ggplot2::unit(c(4, 6), "mm")
            )
        ) %>%
        ggpubr::table_cell_bg(
            row = 2:4,
            column = 1,
            fill = colorspace::lighten(unname(drywetColors["DRY"]), amount = 0.2),
            color = "white"
        ) %>%
        ggpubr::table_cell_bg(
            row = 5:9,
            column = 1,
            fill = colorspace::lighten(unname(drywetColors["WET"]), amount = 0.3),
            color = "white"
        )

    # patchwork layout: (conditions over summary) beside the pie charts
    left_col <- (table_cond + ggplot2::theme(plot.margin = ggplot2::unit(c(0,0,100,0), "pt"))) / table_sum
    combined_plot <- left_col | piecharts

    results <- list(data = dry_or_wet, plot = combined_plot)
    return(results)
}
source("dryOrWet.r")  # Load the dry/wet classification function
# Classify and summarise by site and period; returns list(data, plot)
drywet <- dryOrWet("by_site_and_period")
# Save as PNG: 6 x 5 inches at 300 dpi (see plot_saver.r for argument order)
plot_saver(drywet$plot, outpath, "dry_wet.png", 6, 5, 300)

Plot image

Variables Overviewer

Here we combine all the data we have from the dry/wet classification, petrography, and residue (GCMS) analyses, and plot each variable.
For each variable name we added prefixes addressing their origin:
- i.* Information about the specimen
- m.* Measurements of the specimen
- c.* Carbonization coloration analysis
- g.* GCMS data
- p.* Petrography analysis data

Variables Overviewer Function:

# Join the carbonization (dryOrWet), petrography and GCMS residue tables into a
# single data frame, prefix every variable by its origin (i./m./c./g./p.),
# optionally print variable descriptions, and save overview plots (boxplots
# for numeric variables, grouped bar charts for categorical ones) to
# <savepath>/overview.
#
# Args:
#   inpath:   directory containing the input excel files (names in settings.r)
#   savepath: directory under which the "overview" plot folder is created
#   save:     NOTE(review): currently unused -- confirm intent, then honour or remove
#   talky:    if TRUE, print join diagnostics and variable descriptions
# Returns:
#   list(data, numeric_data, non_numeric_data, numeric_plots,
#        non_numeric_plots, merged_plots)
overviewer <- function(inpath, savepath, save = FALSE, talky = TRUE) {
    ### LOADINGS
        ## dependencies
            source("settings.r")
            source("load_pckgs.r")
            source("parse_bounds.r")
            source("sum_non_numeric.r")
            source("string_processor.r")
            source("dryOrWet.r")
            dependencies <-  c("readxl", "writexl", "dplyr", "tidyr", "stringr", "scales", "ggplot2", "Hmisc", "ggcorrplot", "grid", "gridExtra", "viridis")
            invisible(lapply(dependencies, load_pkg))

        ## data
            raw_df <- readxl::read_excel(file.path(inpath, info_and_carb_data_file_name), na = "")
            raw_petro <- readxl::read_excel(file.path(inpath, petro_data_file_name), na = "")
            raw_arch <- dryOrWet("by_site_and_period")$data %>% dplyr::ungroup()
            raw_gcms <- readxl::read_excel(file.path(inpath, gcms_data_file_name), sheet = "Data", col_names = TRUE, na = "")

            #if (talky) {
            #    if (all(sapply(mget(c("raw_arch", "raw_gcms", "raw_petro")), function(x) nrow(x) > 0))) {
            #        message("✅ All tables were loaded and contain records.")
            #    } else {
            #        warning("❌ One or more tables are empty!")
            #    }
            #}

        ## result list preparing
            results <- list(
                data = NULL,
                numeric_data = NULL,
                non_numeric_data = NULL,
                numeric_plots = NULL,
                non_numeric_plots = NULL,
                merged_plots = NULL
            )

    ### PROCESSING DATA
        # raw_arch: recode the carbonization table -- factorise site/period,
        # derive part-presence flags and a completeness percentage, and prefix
        # variables by origin (i./m./c.)
            arch <- raw_arch %>%
                dplyr::ungroup() %>%
                dplyr::rename(
                    site = site_name,
                    id = aux_id,
                    conf = state
                ) %>%
                dplyr::mutate(
                    # sign of state -> dry/wet group; 0 means unidentified
                    group = dplyr::case_when(
                        conf < 0 ~ "dry",
                        conf > 0 ~ "wet",
                        conf == 0 ~ NA
                        ),
                    group = factor(group, levels = c("dry", "wet"), ordered = TRUE),
                    # |state| 3/2/1 -> confidence score 1/0.5/0.3
                    conf = dplyr::case_when(
                        abs(conf) == 3 ~ 1,
                        abs(conf) == 2 ~ 0.5,
                        abs(conf) == 1 ~ 0.3
                        ),
                    site = factor(site, levels = c("Gaoqingcaopo", "Kanjiazhai", "Yangxinliwu"), ordered = FALSE),
                    period = factor(period, levels = c("Shang", "Zhou"), ordered = TRUE),
                    # a part counts as present if either side of it was recorded
                    lip = !is.na(col_in_lip) | !is.na(col_out_lip),
                    neck = !is.na(col_in_neck) | !is.na(col_out_neck),
                    shoulder = !is.na(col_in_shoulder) | !is.na(col_out_shoulder),
                    body = !is.na(col_in_body) | !is.na(col_out_body),
                    foot = !is.na(col_in_foot) | !is.na(col_out_foot),
                    crotch = !is.na(col_out_crotch),
                    completeness = round(100 * (lip + neck + shoulder + body + foot + crotch) / 6),
                    dplyr::across(dplyr::starts_with("col_"), ~ factor(tolower(as.character(.)))),
                    collection = paste0(period, ".", substr(as.character(site), 1, 1))
                ) %>%
                dplyr::mutate(
                    collection = factor(collection, levels = c("Shang.Y", "Shang.G", "Zhou.G", "Zhou.K"))
                ) %>%
                dplyr::select(
                    -c(record_id, sample_number, part, mouth_present, foot_present, vis_alysis, notes, decoration, residue_sampled, petro_sampled, question_field)
                    ) %>%
                dplyr::rename_with(~ paste0("i.", .), c(collection, site, period, lip, neck, shoulder, body, foot, crotch, completeness, cord_mark_type)) %>%
                dplyr::rename_with(~ paste0("m.", .), c(body_width, height, rim,  orifice, cord_mark_width)) %>%
                dplyr::rename_with(~ paste0("c.", .), c(dplyr::starts_with("col_"), group, conf))

        # raw_petro: clean text columns, parse fragment size/roundness bounds,
        # and expand the rock-fragment description into boolean flags (p.*)
            petro <- raw_petro %>%
                dplyr::select(-Photo, -`Interesting Incs`, -Site, -Period, -`Dry/Wet`, -`Dry/Wet Simple`, -Group) %>%
                dplyr::rename(
                    id = `Sample #`,
                    optical_activity = `Optical Activity`,
                    rock_frags = `Rock Frags`,
                    frag_size = `Frag Size`,
                    frag_roundness = `Frag Roundness`,
                    mica = Mica,
                    group = `Group Simplified`
                ) %>%
                dplyr::mutate(
                    # Fixed: the prefix was "A", which can never match after
                    # str_to_lower() (str_starts is case-sensitive), so every row
                    # came out FALSE. NOTE(review): confirm "a" separates the
                    # intended vocabulary in the raw data (e.g. "active" vs
                    # "inactive") -- verify against the petrography sheet.
                    optical_activity = stringr::str_starts(stringr::str_to_lower(optical_activity), "a"),
                    # keep only digits and bound markers (<, >, =, +, -, .)
                    frag_size = stringr::str_replace_all(frag_size, "[^0-9.<>=+-]", ""),
                    frag_roundness = stringr::str_to_lower(frag_roundness)
                )

            frag_size_clean <- parse_bounds(petro$frag_size) %>%
                dplyr::rename(
                    lower_fr_bound = lower,
                    upper_fr_bound = upper,
                    frag_size = original
                )

            # Lookup: raw roundness strings -> ordered lower/upper bound levels.
            # NOTE(review): the "and-*" keys look like typos for "ang-*" but may
            # deliberately mirror typos in the raw data -- confirm before editing.
            frag_shape_clean <- dplyr::tibble(
                original = c("and-subang", "ang-subang", "ang-subrnd", "and-subrnd", "subang-subrnd", "subang-rnd", "ang-rnd", "rnd"),
                lower_roundness_bound = c("angular", "angular", "angular", "angular", "subangular", "subangular", "angular", "round"),
                upper_roundness_bound = c("subangular", "subangular", "subround", "subround", "subround", "round", "round", "round")
                )

            # Distinct rock-fragment terms (split on "and", ";", "/", "-")
            rock_dictionary <- petro %>%
                dplyr::filter(!is.na(rock_frags)) %>%
                dplyr::mutate(rock_frags = stringr::str_replace_all(rock_frags, " and ", ";")) %>%
                tidyr::separate_rows(rock_frags, sep = ";|/|-") %>%
                dplyr::mutate(rock_frags = stringr::str_trim(rock_frags)) %>%
                dplyr::filter(rock_frags != "") %>%
                dplyr::distinct(rock_frags)

            petro <- dplyr::bind_cols(petro, frag_size_clean) %>%
                dplyr::left_join(frag_shape_clean, by = c("frag_roundness" = "original")) %>%
                dplyr::mutate(
                    lower_roundness_bound = factor(
                        lower_roundness_bound,
                        levels = c("angular", "subangular", "subround", "round"),
                        ordered = TRUE
                        ),
                    upper_roundness_bound = factor(
                        upper_roundness_bound,
                        levels = c("angular", "subangular", "subround", "round"),
                        ordered = TRUE
                        ),
                    mica = factor(
                        mica,
                        levels = c("None", "No", "Rare", "Few", "Moderate", "Yes"),
                        ordered = TRUE
                        ),
                    group = factor(group),
                    # One boolean flag per rock type (redundant ifelse(cond, TRUE,
                    # FALSE) wrappers removed -- the guarded condition is already
                    # a plain logical with no NAs).
                    # NOTE(review): "granit" also matches "microgranite"; confirm
                    # the granite flag is meant to include it.
                    granite = !is.na(rock_frags) & stringr::str_detect(rock_frags, stringr::regex("granit", ignore_case = TRUE)),
                    granodiorite = !is.na(rock_frags) & stringr::str_detect(rock_frags, stringr::regex("granodiorite", ignore_case = TRUE)),
                    diorite = !is.na(rock_frags) & stringr::str_detect(rock_frags, stringr::regex("diorite", ignore_case = TRUE)),
                    sandstone = !is.na(rock_frags) & stringr::str_detect(rock_frags, stringr::regex("sandstone", ignore_case = TRUE)),
                    mudstone = !is.na(rock_frags) & stringr::str_detect(rock_frags, stringr::regex("mudstone", ignore_case = TRUE)),
                    limestone = !is.na(rock_frags) & stringr::str_detect(rock_frags, stringr::regex("limestone", ignore_case = TRUE)),
                    gritstone = !is.na(rock_frags) & stringr::str_detect(rock_frags, stringr::regex("gritstone", ignore_case = TRUE)),
                    volcanic = !is.na(rock_frags) & stringr::str_detect(rock_frags, stringr::regex("volcanic", ignore_case = TRUE)),
                    andesite = !is.na(rock_frags) & stringr::str_detect(rock_frags, stringr::regex("andesite", ignore_case = TRUE)),
                    microgranite = !is.na(rock_frags) & stringr::str_detect(rock_frags, stringr::regex("microgranite", ignore_case = TRUE)),
                    ksp = !is.na(rock_frags) & stringr::str_detect(rock_frags, stringr::regex("ksp", ignore_case = TRUE))
                ) %>%
                dplyr::select( -frag_roundness, -rock_frags, - dplyr::starts_with("frag_size")) %>%
                dplyr::rename_with(~ paste0("p.", .), -id)

        # raw_gcms: count plant/tree markers, flag interpretation keywords from
        # the free-text comment, recode confidence and group (g.*)
            chumpsy_vars <- c("aapa", "phy", "tmtd", "cholesterol_bi_products", "miliacin", "ergostanol", "dha", "lc_ketones", "pah", "bpca", "contaminants")
            gcms <- raw_gcms %>%
                dplyr::rename_with(tolower) %>%
                dplyr::rowwise() %>%   # c_across() below needs row-wise evaluation
                dplyr::rename(
                    lipids_conc = ug_g,
                    group = preliminary_interpretation,
                    conf = level_of_confidence,
                    srr = `srr_%` 
                    ) %>%
                dplyr::mutate(
                    # strip trailing "A" from ids -- presumably a re-sample
                    # suffix; TODO confirm against the GCMS sheet
                    id = sub("A$", "", id),
                    plant_count = sum(tolower(dplyr::c_across(dplyr::all_of(chumpsy_vars))) == 'p', na.rm = TRUE),
                    tree_count = sum(tolower(dplyr::c_across(dplyr::all_of(chumpsy_vars))) == 'tr', na.rm = TRUE),
                    cereal = !is.na(comment) & stringr::str_detect(comment, stringr::regex("cereal", ignore_case = TRUE)),
                    fruit = !is.na(comment) & stringr::str_detect(comment, stringr::regex("fruit", ignore_case = TRUE)),
                    vegetable = !is.na(comment) & stringr::str_detect(comment, stringr::regex("vegetab", ignore_case = TRUE)),
                    resin = !is.na(comment) & stringr::str_detect(comment, stringr::regex("resin", ignore_case = TRUE)),
                    fish = !is.na(comment) & stringr::str_detect(comment, stringr::regex("fish", ignore_case = TRUE)),
                    complex_mxtr = !is.na(comment) & stringr::str_detect(comment, stringr::regex("complex", ignore_case = TRUE)),
                    # confidence level 1/2/3 -> score 1/0.5/0.3
                    conf = dplyr::case_when(
                        conf == 1 ~ 1,
                        conf == 2 ~ 0.5,
                        conf == 3 ~ 0.3
                        ),
                    group = dplyr::case_when(
                        group == 1 ~ "plant",
                        group == 2 ~ "mixture",
                        group == 3 ~ "animal"
                        ),
                    group = factor(
                        group,
                        levels = c("plant", "mixture", "animal"),
                        ordered = TRUE
                        )
                    ) %>%
                dplyr::ungroup() %>%
                dplyr::select(-dynastie, -dplyr::all_of(chumpsy_vars), -comment) %>%
                dplyr::rename_with(~ paste0("g.", .), -id)

        # All the tables are ready for joining now
           info.variable_prefix <- c(
                "ℹ️ Prefixes were added to each variable name addressing their function or derivation",
                "\t i.* = Information about the specimen",
                "\t m.* = Measurements of the specimen",
                "\t c.* = Carbonization coloration analysis",
                "\t g.* = GCMS data",
                "\t p.* = Petrography analysis data"
            )

            #if (talky) {
            #    for (line in info.variable_prefix) {
            #        cat(line, "\n")
            #    }
            #}

        # Fusing: keep only samples present in all three tables
            data <- arch %>%
                dplyr::inner_join(gcms, by = "id") %>%
                dplyr::inner_join(petro, by = "id")

            if (nrow(data) == 0) {
                stop("❌ data has no records!")
            } else {
                results$data <- data
            }



            if (talky) {
                message("⚠️ inner joining was used to link the data, use left_join to conserve unmatched records")

                intersected_ids <- length(Reduce(intersect, list(arch$id, gcms$id, petro$id)))

                cat("Petrography:", nrow(petro), "records\n")
                cat("GCMS:", nrow(gcms), "records\n")
                cat("Carbonisation:", nrow(arch), "records\n\n")
                cat("Max number of intersected id-s:", intersected_ids, "\n")
                cat("Number of records in the final table:", nrow(data), "\n")
                #dplyr::glimpse(data)
                cat("\n\n📦 Variable Descriptions:\n\n")
                # var_info: per-variable metadata list (type/units/descr),
                # presumably defined in settings.r -- TODO confirm
                for (var in names(var_info)) {
                    info <- var_info[[var]]
                    cat(sprintf("📌 %s:\n", var))
                    cat(sprintf("  🔧 Type: %s", info$type))
                    if (!is.null(info$units)) cat(sprintf(" | 📏 Units: %s", info$units))
                    cat("\n")
                    cat(sprintf("  📝 Description: %s\n", info$descr))

                    if (var %in% names(data)) {
                        values <- data[[var]]
                        options <- if (is.numeric(values)) {
                                paste("from", round(min(values, na.rm = TRUE), 1), "to", round(max(values, na.rm = TRUE), 1))
                            } else {
                                vals <- unique(values)
                                lvl <- levels(values)
                                # if/else instead of ifelse(): the condition is
                                # scalar and ifelse() would evaluate both branches
                                if (is.null(lvl)) {
                                    paste(vals, collapse = ", ")
                                } else {
                                    paste(lvl, collapse = ", ")
                                }
                            }
                        cat(sprintf("  🔢 Values: %s\n", options))
                    } else {
                        cat("  ⚠️ Variable not found in 'data'\n")
                    }
                    cat("\n")
                }
            }

    # ANALYSIS

# Overview plotting
    # Select numeric data in long format, grouped by collection
        numeric_data <- data %>% 
            dplyr::select(dplyr::where(is.numeric), i.collection) %>% 
            dplyr::select(-i.completeness) %>%
            tidyr::pivot_longer(
                cols = -i.collection,
                names_to = "variable",
                values_to = "value"
            ) %>% 
            dplyr::rename(group = i.collection) %>% 
            dplyr::filter(!variable %in% excluded_vars)  # excluded_vars from settings.r

    # Create boxplot for each numeric variable
        numeric_plots <- list()

        for (var in unique(numeric_data$variable)) {
            # fetch the var data
                df_var <- numeric_data %>% dplyr::filter(variable == var) %>% dplyr::filter(is.finite(value))
                info <- var_info[[var]]

            # count sample sizes and find where to place the "n = ..." label
                counts <- df_var %>%
                    dplyr::group_by(group) %>%
                    dplyr::summarize(
                        n = sum(!is.na(value)),
                        # if/else instead of ifelse(): avoids evaluating min()/max()
                        # (and their warnings) on an all-NA group
                        y_min = if (all(is.na(value))) NA_real_ else min(value, na.rm = TRUE),
                        y_max = if (all(is.na(value))) NA_real_ else max(value, na.rm = TRUE),
                        .groups = "drop"
                    ) %>%
                    dplyr::mutate( # If all values are identical or NA, offset to 0
                        range   = ifelse(!is.na(y_min) & y_min != y_max, y_max - y_min, 0),
                        label_y = y_min - 0.05 * range   # 5% below the min
                    )

            # Boxplot the variable
                p <- ggplot2::ggplot(df_var, ggplot2::aes(x = group, y = value)) +
                    ggplot2::geom_boxplot() +
                    ggplot2::geom_text(
                        data = counts,
                        ggplot2::aes(x = group, y = label_y, label = paste("n =", n)),
                        vjust = 1,
                        size = 3
                    ) +
                    ggplot2::labs(
                        title = var,
                        subtitle = stringr::str_wrap(info$descr, width = 60),
                        x = NULL,
                        y = "Value"
                    ) +
                    ggplot2::theme_minimal() +
                    ggplot2::coord_cartesian(clip = "off") +  # keep labels below the panel visible
                    ggplot2::theme(
                        axis.text  = ggplot2::element_text(size = 6),
                        axis.title = ggplot2::element_text(size = 6),
                        plot.title = ggplot2::element_text(size = 10),
                        plot.subtitle = ggplot2::element_text(size = 6),
                        plot.margin = ggplot2::unit(c(5, 5, 20, 5), "pt")
                    )
            # save the plot to the list
                numeric_plots[[var]] <- p
        }

    # Summarise non-numeric variables
        sum_tables <- sum_non_numeric(data, "i.completeness")

        non_numeric_plots <- list()

        for (var in names(sum_tables)) {
            # fetch the var data
                df <- sum_tables[[var]] %>%
                    tidyr::pivot_longer(
                        cols = -group,
                        names_to = "Option",
                        values_to = "Count"
                        )

                info <- var_info[[var]]

            # restore the factor levels so bars keep the original ordering
                if (var %in% names(data) && is.factor(data[[var]])) {
                    desired_levels <- levels(data[[var]])
                    df$Option <- factor(df$Option, levels = desired_levels)
                } else {
                    df$Option <- factor(df$Option, levels = sort(unique(df$Option)))
                }

            # create a grouped bar plot
                p <- ggplot2::ggplot(
                    df,
                    ggplot2::aes(x = group, y = Count, fill = Option)) +
                    ggplot2::geom_bar(stat = "identity", position = ggplot2::position_dodge()) +
                    ggplot2::labs(
                        title = var,
                        subtitle = stringr::str_wrap(info$descr, width = 60),
                        x = NULL,
                        y = "Count"
                    ) +
                    ggplot2::scale_fill_viridis_d() +
                    ggplot2::theme_minimal() +
                    ggplot2::theme(
                        axis.text = ggplot2::element_text(size = 6),
                        axis.title = ggplot2::element_text(size = 6),
                        plot.title = ggplot2::element_text(size = 10),
                        plot.subtitle = ggplot2::element_text(size = 6),
                        axis.title.x = ggplot2::element_blank(),
                        legend.text = ggplot2::element_text(size = 8),
                        legend.title = ggplot2::element_text(size = 8)
                    )

                non_numeric_plots[[var]] <- p
        }

    # Save the results
        results$numeric_data <- numeric_data
        results$non_numeric_data <- sum_tables
        results$numeric_plots <- numeric_plots
        results$non_numeric_plots <- non_numeric_plots

    # Combine plots by prefix; "group" plots get their own combined sheet
        group_plots     <- non_numeric_plots[  grepl("group", names(non_numeric_plots)) ]
        other_plots <- non_numeric_plots[ !grepl("group", names(non_numeric_plots)) ]
        non_group_plots <- c(numeric_plots, other_plots)

        # prefix = everything before the first "." (i/m/c/g/p)
        get_prefix <- function(name) sub("\\..*", "", name)

        plot_names <- names(non_group_plots)
        prefixes <- unique(get_prefix(plot_names))
        excluded_prefixes <- NULL
        prefixes <- setdiff(prefixes, excluded_prefixes)

        merged_plots <- list()

        if (!dir.exists(file.path(savepath, "overview"))) {
            dir.create(file.path(savepath, "overview"), recursive = TRUE)
        }

        for (prefix in prefixes) {
            # vectorized mask instead of sapply(): get_prefix() is vectorized
            plots_for_prefix <- non_group_plots[ get_prefix(plot_names) == prefix ]
            file_name <- file.path(savepath, "overview", paste0(prefix, "_plots.png"))

            merged_plot <- gridExtra::arrangeGrob(grobs = plots_for_prefix, ncol = 4)

            #tiff(filename = file_name, width = 30, height = 20, units = "cm", res = 300)
            grDevices::png(filename = file_name, width = 30, height = 20, units = "cm", res = 300)
            grid::grid.draw(merged_plot)
            grDevices::dev.off()

            merged_plots[[prefix]] <- merged_plot
        }

       if (length(group_plots) > 0) {
            file_name <- file.path(savepath, "overview", "group_plots.png")
            merged_group_plot <- gridExtra::arrangeGrob(grobs = group_plots, ncol = length(group_plots))
            #tiff(filename = file_name, width = 30, height = 10, units = "cm", res = 300)
            grDevices::png(filename = file_name, width = 30, height = 10, units = "cm", res = 300)
            grid::grid.draw(merged_group_plot)
            grDevices::dev.off()
            }

        results$merged_plots <- merged_plots
        #if (talky) cat("💾 Files have been saved to", outpath)
        return(results)
}

Variables description & data inspection

source("overviewer.r")  # Load the variables overviewer function
# Join the three data tables, print diagnostics, and save overview plots to outpath
r <- overviewer(inpath, outpath, talky = T)
## Petrography: 103 records
## GCMS: 105 records
## Carbonisation: 371 records
## 
## Max number of intersected id-s: 103 
## Number of records in the final table: 103 
## 
## 
## 📦 Variable Descriptions:
## 
## 📌 id:
##   🔧 Type: text id
##   📝 Description: Unique sample id
##   🔢 Values: Y15, Y19, Y20, Y26, Y28, Y29, Y33, Y34, Y40, Y49, Y50, Y51, Y52, Y62, Y78, Y83, Y84, Y85, Y91, Y117, Y119, Y134, Y151, Y152, Y156, Y169, Y172, Y179, Y215, Y217, G4, G5, G9, G10, G19, G20, G21, G22, G27, G28, G30, G34, G35, G39, G41, G42, G47, G48, G49, G55, G56, G60, G61, G62, G63, G66, G67, G70, G71, G73, G75, G80, G81, G87, G88, G90, G91, G93, G95, G96, G97, G98, G99, G104, K6, K7, K8, K9, K10, K11, K12, K13, K14, K15, K16, K17, K18, K19, K20, K21, K22, K23, K24, K25, K26, K27, K28, K29, K30, K31, K32, K33, K34
## 
## 📌 i.site:
##   🔧 Type: txt fct
##   📝 Description: The name of the site
##   🔢 Values: Gaoqingcaopo, Kanjiazhai, Yangxinliwu
## 
## 📌 i.period:
##   🔧 Type: txt ord
##   📝 Description: The dynastie
##   🔢 Values: Shang, Zhou
## 
## 📌 i.lip:
##   🔧 Type: lgl
##   📝 Description: TRUE if lip is present in a sample
##   🔢 Values: TRUE, FALSE
## 
## 📌 i.neck:
##   🔧 Type: lgl
##   📝 Description: TRUE if neck is present in a sample
##   🔢 Values: TRUE, FALSE
## 
## 📌 i.shoulder:
##   🔧 Type: lgl
##   📝 Description: TRUE if shoulder is present in a sample
##   🔢 Values: TRUE, FALSE
## 
## 📌 i.body:
##   🔧 Type: lgl
##   📝 Description: TRUE if body is present in a sample
##   🔢 Values: TRUE, FALSE
## 
## 📌 i.foot:
##   🔧 Type: lgl
##   📝 Description: TRUE if foot is present in a sample
##   🔢 Values: FALSE, TRUE
## 
## 📌 i.crotch:
##   🔧 Type: lgl
##   📝 Description: TRUE if crotch is present in a sample
##   🔢 Values: FALSE, TRUE
## 
## 📌 i.completeness:
##   🔧 Type: num dbl | 📏 Units: %%
##   📝 Description: 100 means all parts are present in a sample, less means a fragment is missing some parts
##   🔢 Values: from 33 to 100
## 
## 📌 i.collection:
##   🔧 Type: txt fct
##   📝 Description: A combination of period and a first letter of the site
##   🔢 Values: Shang.Y, Shang.G, Zhou.G, Zhou.K
## 
## 📌 m.body_width:
##   🔧 Type: num dbl | 📏 Units: mm
##   📝 Description: Width of the body
##   🔢 Values: from 171 to 171
## 
## 📌 m.height:
##   🔧 Type: num dbl | 📏 Units: mm
##   📝 Description: Height of the vessel fragment
##   🔢 Values: from 11.5 to 15
## 
## 📌 m.rim:
##   🔧 Type: num dbl | 📏 Units: mm
##   📝 Description: The size of the rim
##   🔢 Values: from 15 to 57
## 
## 📌 m.orifice:
##   🔧 Type: num dbl | 📏 Units: mm
##   📝 Description: The size of the orifice
##   🔢 Values: from 10 to 52
## 
## 📌 m.cord_mark_width:
##   🔧 Type: num dbl | 📏 Units: mm
##   📝 Description: The width of a cordmark
##   🔢 Values: from 1 to 17
## 
## 📌 m.cord_mark_type:
##   🔧 Type: txt fct | 📏 Units: mm
##   📝 Description: Cordmark type
##   ⚠️ Variable not found in 'data'
## 
## 📌 c.col_in_lip:
##   🔧 Type: txt fct
##   📝 Description: A coloration of the lip part from inside
##   🔢 Values: carb, clear
## 
## 📌 c.col_in_neck:
##   🔧 Type: txt fct
##   📝 Description: A coloration of the neck part from inside
##   🔢 Values: carb, clear, water
## 
## 📌 c.col_in_shoulder:
##   🔧 Type: txt fct
##   📝 Description: A coloration of the shoulder part from inside
##   🔢 Values: carb, clear, water
## 
## 📌 c.col_in_body:
##   🔧 Type: txt fct
##   📝 Description: A coloration of the body part from inside
##   🔢 Values: carb, clear, water
## 
## 📌 c.col_out_lip:
##   🔧 Type: txt fct
##   📝 Description: A coloration of the lip part from outside
##   🔢 Values: carb, clear
## 
## 📌 c.col_out_neck:
##   🔧 Type: txt fct
##   📝 Description: A coloration of the neck part from outside
##   🔢 Values: carb, clear, oxide
## 
## 📌 c.col_out_shoulder:
##   🔧 Type: txt fct
##   📝 Description: A coloration of the shoulder part from outside
##   🔢 Values: carb, clear, oxide
## 
## 📌 c.col_out_body:
##   🔧 Type: txt fct
##   📝 Description: A coloration of the body part from outside
##   🔢 Values: carb, clear, oxide, water
## 
## 📌 c.col_in_foot:
##   🔧 Type: txt fct
##   📝 Description: A coloration of the foot part from inside
##   🔢 Values: carb, clear, water
## 
## 📌 c.col_out_foot:
##   🔧 Type: txt fct
##   📝 Description: A coloration of the foot part from outside
##   🔢 Values: carb, clear, oxide
## 
## 📌 c.col_out_crotch:
##   🔧 Type: txt fct
##   📝 Description: A coloration of the crotch part from outside
##   🔢 Values: carb, clear, oxide
## 
## 📌 c.group:
##   🔧 Type: txt ord
##   📝 Description: A classification according to carbonisation analysis
##   🔢 Values: dry, wet
## 
## 📌 c.conf:
##   🔧 Type: num dbl | 📏 Units: %
##   📝 Description: A confidence level for 'Dry or Wet' classification, 1 means dead certainty
##   🔢 Values: from 0.3 to 1
## 
## 📌 g.lipids_conc:
##   🔧 Type: num dbl | 📏 Units: %
##   📝 Description: A concentration of lipids in a sample, promille?
##   🔢 Values: from 0.9 to 14992.7
## 
## 📌 g.scfa:
##   🔧 Type: num dbl | 📏 Units: %
##   📝 Description: Relative abundance of short-chain fatty acids (C2:C6)
##   🔢 Values: from 1.5 to 21.4
## 
## 📌 g.mcfa:
##   🔧 Type: num dbl | 📏 Units: %
##   📝 Description: Relative abundance of medium-chain fatty acids (C6:C12)
##   🔢 Values: from 19.5 to 83.5
## 
## 📌 g.lcfa:
##   🔧 Type: num dbl | 📏 Units: %
##   📝 Description: Relative abundance of long-chain fatty acids (>C12)
##   🔢 Values: from 0 to 50.9
## 
## 📌 g.uns_fa:
##   🔧 Type: num dbl | 📏 Units: %
##   📝 Description: Relative abundance of unsaturated fatty acids
##   🔢 Values: from 0 to 33.2
## 
## 📌 g.diacids:
##   🔧 Type: num dbl | 📏 Units: %
##   📝 Description: Relative abundance of dicarboxylic acids: oxidized fatty acids with two carboxyl groups
##   🔢 Values: from 0 to 14.8
## 
## 📌 g.alkanes:
##   🔧 Type: num dbl | 📏 Units: %
##   📝 Description: Relative abundance of saturated hydrocarbons
##   🔢 Values: from 0 to 73.7
## 
## 📌 g.p_s:
##   🔧 Type: num dbl
##   📝 Description: Pristane/Phytane ratio ??
##   🔢 Values: from 0.6 to 3.4
## 
## 📌 g.o_s:
##   🔧 Type: num dbl
##   📝 Description: Odd-over-even carbon number predominance ??
##   🔢 Values: from 0 to 1.3
## 
## 📌 g.aapac18e_h:
##   🔧 Type: num dbl
##   📝 Description: An index related to C18 alkylphenanthrene homologs (E/H variants) ??
##   🔢 Values: from 1.1 to 12.3
## 
## 📌 g.srr:
##   🔧 Type: num dbl
##   📝 Description: A sterane ratio expressed as a percentage ??
##   🔢 Values: from 42.6 to 77.6
## 
## 📌 g.group:
##   🔧 Type: txt ord
##   📝 Description: A classification for organics source based on expert analysis
##   🔢 Values: plant, mixture, animal
## 
## 📌 g.conf:
##   🔧 Type: num dbl
##   📝 Description: A confidence level for organic source classification (g.group), 1 means dead certainty
##   🔢 Values: from 0.3 to 1
## 
## 📌 g.plant_count:
##   🔧 Type: num int
##   📝 Description: A rowwise count for 'plant'
##   🔢 Values: from 0 to 7
## 
## 📌 g.tree_count:
##   🔧 Type: num int
##   📝 Description: A rowwise count for 'tree'
##   🔢 Values: from 0 to 1
## 
## 📌 g.cereal:
##   🔧 Type: lgl
##   📝 Description: True if 'cereal' was mention in the comments
##   🔢 Values: FALSE, TRUE
## 
## 📌 g.fruit:
##   🔧 Type: lgl
##   📝 Description: True if 'fruit' was mention in the comments
##   🔢 Values: FALSE, TRUE
## 
## 📌 g.vegetable:
##   🔧 Type: lgl
##   📝 Description: True if 'vegetable' was mention in the comments
##   🔢 Values: FALSE, TRUE
## 
## 📌 g.resin:
##   🔧 Type: lgl
##   📝 Description: True if 'resin' was mention in the comments
##   🔢 Values: FALSE, TRUE
## 
## 📌 g.fish:
##   🔧 Type: lgl
##   📝 Description: True if 'fish' was mention in the comments
##   🔢 Values: FALSE, TRUE
## 
## 📌 g.complex_mxtr:
##   🔧 Type: lgl
##   📝 Description: True if 'complex mixture' was mention in the comments
##   🔢 Values: FALSE, TRUE
## 
## 📌 p.optical_activity:
##   🔧 Type: lgl
##   📝 Description: True if optical activity is 'Active'
##   🔢 Values: FALSE
## 
## 📌 p.mica:
##   🔧 Type: txt ord
##   📝 Description: The relative abundance of mica minerals
##   🔢 Values: None, No, Rare, Few, Moderate, Yes
## 
## 📌 p.group:
##   🔧 Type: txt fct
##   📝 Description: A classification for petrography based on expert analysis
##   🔢 Values: CPX Sand, Feldspar Sand, Feldspar Sand + Lime, Fine Silt, Granitic + Sed, Granitic Sand, Granodiorite, Metamorphic, Sandstone, Tonalite, Volcanic
## 
## 📌 p.lower_fr_bound:
##   🔧 Type: num dbl | 📏 Units: mm
##   📝 Description: A lower bound for rock fragments approximate size range
##   🔢 Values: from 0 to 1
## 
## 📌 p.upper_fr_bound:
##   🔧 Type: num dbl
##   📝 Description: An upper bound for rock fragments approximate size range
##   🔢 Values: from 0.1 to 6
## 
## 📌 p.lower_roundness_bound:
##   🔧 Type: txt ord
##   📝 Description: A lower bound for rock roundndess
##   🔢 Values: angular, subangular, subround, round
## 
## 📌 p.upper_roundness_bound:
##   🔧 Type: txt ord
##   📝 Description: An upper bound for rock roundndess
##   🔢 Values: angular, subangular, subround, round
## 
## 📌 p.granite:
##   🔧 Type: lgl
##   📝 Description: True if 'granite' was mentioned in the comments
##   🔢 Values: TRUE, FALSE
## 
## 📌 p.granodiorite:
##   🔧 Type: lgl
##   📝 Description: True if 'granodiorite' was mentioned in the comments
##   🔢 Values: FALSE, TRUE
## 
## 📌 p.diorite:
##   🔧 Type: lgl
##   📝 Description: True if 'diorite' was mentioned in the comments
##   🔢 Values: FALSE, TRUE
## 
## 📌 p.sandstone:
##   🔧 Type: lgl
##   📝 Description: True if 'sandstone' was mentioned in the comments
##   🔢 Values: FALSE, TRUE
## 
## 📌 p.mudstone:
##   🔧 Type: lgl
##   📝 Description: True if 'mudstone' was mentioned in the comments
##   🔢 Values: FALSE, TRUE
## 
## 📌 p.limestone:
##   🔧 Type: lgl
##   📝 Description: True if 'limestone' was mentioned in the comments
##   🔢 Values: FALSE, TRUE
## 
## 📌 p.gritstone:
##   🔧 Type: lgl
##   📝 Description: True if 'gritstone' was mentioned in the comments
##   🔢 Values: FALSE, TRUE
## 
## 📌 p.volcanic:
##   🔧 Type: lgl
##   📝 Description: True if 'volcanic' was mentioned in the comments
##   🔢 Values: FALSE, TRUE
## 
## 📌 p.andesite:
##   🔧 Type: lgl
##   📝 Description: True if 'andesite' was mentioned in the comments
##   🔢 Values: FALSE, TRUE
## 
## 📌 p.microgranite:
##   🔧 Type: lgl
##   📝 Description: True if 'microgranite' was mentioned in the comments
##   🔢 Values: FALSE, TRUE
## 
## 📌 p.ksp:
##   🔧 Type: lgl
##   📝 Description: True if 'ksp' was mentioned in the comments
##   🔢 Values: FALSE, TRUE
# Collect every plot produced by overviewer() and embed each one as a
# Markdown image tag, separated by blank lines, so knitr renders them inline.
overvw_plots <- list.files(file.path(outpath, "overview"), full.names = TRUE)
imgs_markdown <- paste(sprintf("![](%s)", overvw_plots), collapse = "\n\n")
asis_output(imgs_markdown) # emit as raw Markdown (not escaped text)

Spectral analysis

Overview

GCMS data is represented with a continuous set of mass spectra, m/z spectra specifically, where m stands for mass, and z stands for the charge number of ions. We can present the GCMS data for a random spectrum file as a heatmap, where larger intensity means darker color, retention time is the X axis, and m/z is the Y axis. We can first look at the summed projections of such a heatmap: summing over m/z (Y) gives us the Total Ion Chromatogram (TIC), i.e. how many ions were detected at each time point; summing over retention time (X) gives us the general m/z spectrum of the sample.

Spectra Checker Function:

spec_checker <- function(in_path, spec_path) {
    # Match the mzML spectra files found in `spec_path` against the sample
    # metadata from the info/carbonisation Excel file located in `in_path`.
    #
    # Args:
    #   in_path:   directory containing `info_and_carb_data_file_name`
    #              (defined in settings.r).
    #   spec_path: directory containing the .mzML spectra files.
    #
    # Returns: a tibble with columns id, period, site, path -- one row per
    #   mzML file whose id has a metadata match.
    #
    # Side effects: warns about file ids missing from the metadata and
    #   prints the number of matched files.

    # List mzML files
        mzml_list <- list.files(path = spec_path, pattern = "\\.mzML$", full.names = TRUE)
        if (length(mzml_list) == 0) stop("No mzML files were found in the folder")

    # loadings
        source("settings.r")
        source("load_pckgs.r")
        dependencies <-  c("readxl", "dplyr", "stringr")
        invisible(lapply(dependencies, load_pkg))

    # read metadata
        df <- readxl::read_excel(file.path(in_path, info_and_carb_data_file_name), na = "") %>%
            dplyr::ungroup() %>%
            dplyr::rename(
                site = site_name,
                id   = aux_id
            ) %>%
            dplyr::select(id, period, site)

    # process filenames: e.g. "Y29_A.mzML" -> prefix "Y", num "29", id "Y29"
    # (the postfix between the number and ".mzML" is irrelevant for matching)
        mzml_tbl <- dplyr::tibble(path = mzml_list) %>%
            dplyr::mutate(
                filename = basename(path),
                prefix   = stringr::str_extract(filename, "^[A-Za-z]"),
                num      = stringr::str_extract(filename, "(?<=^[A-Za-z])\\d+"),
                id       = paste0(prefix, num)
            ) %>%
            dplyr::select(path, id)

    # Check filenames: warn about spectra whose id is absent from the metadata
        missing_ids <- setdiff(mzml_tbl$id, df$id)
        if (length(missing_ids) != 0) warning("Following files were not matched: ", paste(missing_ids, collapse = ", "))

    # Merge metadata and file information (namespaced for consistency
    # with the other dplyr calls above):
        data <- dplyr::inner_join(df, mzml_tbl, by = "id")
        if (nrow(data) == 0) stop("No filename matches the metadata ids.")
        cat("Number of matched mzML files:", nrow(data), "\n")

    return(data)
}

Spectrum Overviewer Function:

spec_overviewer <- function(index_tbl, num) {
    # Visualize one GCMS mzML file (row `num` of `index_tbl`) as a combined
    # figure: an rt x m/z intensity heatmap (log10 scale), the total ion
    # current chromatogram (TIC), and the summed mass spectrum.
    #
    # Args:
    #   index_tbl: tibble with a `path` column of mzML paths
    #              (as returned by spec_checker()).
    #   num:       row index of the file to plot.
    #
    # Returns: list(plot = <grob>) containing the assembled grid plot.

    # loadings
        source("settings.r")
        source("load_pckgs.r")
        dependencies <- c("dplyr", "mzR", "ggplot2", "gridExtra", "magick")
        invisible(lapply(dependencies, load_pkg))

    # check if the specified file exists
        file_path <- index_tbl$path[[num]]
        if (!file.exists(file_path)) {
            stop("The file ", file_path, " does not exist.")
        }
        file_name <- basename(file_path)

    # open mzML file and extract info
        message("Fetching data...")
        ms <- openMSfile(file_path)
        hdr <- header(ms)
        # NOTE(review): the m/z range is taken from the FIRST scan only;
        # this assumes every scan covers the same m/z window -- confirm.
        peaks_data <- peaks(ms, 1)
        nScans <- nrow(hdr)

    # define the future plot parameters
        rt_min <- min(hdr$retentionTime)
        rt_max <- max(hdr$retentionTime)

        rt_bins <- seq(rt_min, rt_max, length.out = nScans + 1)
        rt_midpoints <- (rt_bins[-1] + rt_bins[-length(rt_bins)])/2
        rt_limits <- c(rt_min, rt_max)

        mz_min <- min(peaks_data[, 1])
        mz_max <- max(peaks_data[, 1])
        nBinsMz <- round(mz_max - mz_min) + 1   # ~1 Da wide m/z bins
        mz_bins <- seq(mz_min, mz_max, length.out = nBinsMz + 1)
        mz_midpoints <- (mz_bins[-1] + mz_bins[-length(mz_bins)])/2

    # create an empty matrix to store the intensities
        heatmap_matrix <- matrix(0, nrow = nBinsMz, ncol = nScans)

    # populate each scan's data into the heatmap matrix
        for (i in seq_len(nScans)) {
            rt <- hdr$retentionTime[i]
            rt_bin <- findInterval(rt, rt_bins, rightmost.closed = TRUE)
            if (rt_bin < 1 || rt_bin > nScans) next

            peaks_data <- peaks(ms, i)
            if (nrow(peaks_data) == 0) next

            for (j in seq_len(nrow(peaks_data))) {
                mz_val <- peaks_data[j, 1]
                intensity <- peaks_data[j, 2]
                mz_bin <- findInterval(mz_val, mz_bins, rightmost.closed = TRUE)
                if (mz_bin < 1 || mz_bin > nBinsMz) next

                heatmap_matrix[mz_bin, rt_bin] <- heatmap_matrix[mz_bin, rt_bin] + intensity
            }
        }

    # log-transformation (+1 keeps empty bins at exactly zero)
        heatmap_matrix_log <- log10(heatmap_matrix + 1)

    # plotting
        message("Plots building...")
        # plot 1: rt x m/z heatmap
            heatmap <- data.frame(
                rt = rep(rt_midpoints, each = nBinsMz),
                mz = rep(mz_midpoints, times = nScans),
                intensity = as.vector(heatmap_matrix_log)
                )

            p1 <- ggplot(
                    heatmap,
                    aes(x = rt, y = mz, fill = intensity)
                ) +
                geom_tile() +
                scale_fill_gradient(
                    low = "white",
                    high = "black",
                    name = "log10(Intensity)"
                ) +
                labs(
                    title = paste("Intensity Spectrum for ", file_name, "(log10)"), 
                    x = "Retention Time (sec)",
                    y = "m/z"
                    ) +
                    scale_x_continuous(limits = rt_limits, expand = c(0, 0)) +
                theme_minimal() +
                theme(legend.position = "none")

        # plot 2: total ion current chromatogram
            tic <- hdr$totIonCurrent
            df_tic <- data.frame(rt = hdr$retentionTime, tic = tic)

            p2 <- ggplot(
                df_tic,
                aes(x = rt, y = tic)
                ) +
                geom_line(col = "blue") +
                labs(
                    title = paste("Total Ion Current Chromatogram for ", file_name), 
                    x = "Retention Time (sec)",
                    y = "Total Ion Current"
                ) +
                scale_x_continuous(limits = rt_limits, expand = c(0,0)) +
                theme_minimal()

        # plot 3: summed mass spectrum
            # The per-bin intensity sums were already accumulated while
            # building heatmap_matrix (same bins, same guards), so summing
            # its rows avoids a second full pass over every scan.
            mass_spectrum <- data.frame(mz = mz_midpoints, intensity = rowSums(heatmap_matrix))

            p3 <- ggplot(
                mass_spectrum,
                aes(x = mz, y = intensity)
                ) +
                geom_line(col = "red") +
                labs(
                    title = paste("Summed Mass Spectrum for ", file_name),
                    # Labels belong to the aesthetics (x = mz, y = intensity);
                    # coord_flip() below only swaps where they are drawn.
                    x = "m/z",
                    y = "Summed Intensity"
                    ) +
                theme_minimal() +
                coord_flip()

    # close the mzML file
        mzR::close(ms)

    # grid plotting: mass spectrum left of the heatmap, TIC below it
        layout_matrix <- matrix(c(3, 1, NA, 2), nrow = 2, byrow = TRUE)
        combined_plot <- gridExtra::arrangeGrob(p1, p2, p3, layout_matrix = layout_matrix)

    results <- list(plot = combined_plot)
    return(results)

}
source("spec_checker.r")
spec_index <- spec_checker(inpath, path) # checks the files and returns the table of verified paths & metadata
## Number of matched mzML files: 105
#print(spec_index)

source("spec_overviewer.r")
specovrw <- spec_overviewer(spec_index, 1) # provides a visualization of GCMS data for the first spectrum in the folder
## Warning in fun(libname, pkgname): mzR has been built against a different Rcpp version (1.0.13)
## than is installed on your system (1.0.14). This might lead to errors
## when loading mzR. If you encounter such issues, please send a report,
## including the output of sessionInfo() to the Bioc support forum at 
## https://support.bioconductor.org/. For details see also
## https://github.com/sneumann/mzR/wiki/mzR-Rcpp-compiler-linker-issue.
plot_saver(specovrw$plot, outpath, "spectrum_overview.png", 10, 10, 100) # presumably width, height, dpi -- confirm against plot_saver.r

Plot image

PCA

We can now compare samples’ TICs using Principal component analysis in order to find any clustering pattern achieved by rotating the initial variables (ion intensity at each retention time) and presenting them as principal components, which we could then attempt to interpret. The function ticsnpeaks fetches the data, specifically, TICs and peaks:
TICs are total ion chromatograms (the summed result of sample scanning); peaks are retention time ranges where these TICs showed significant evaporation. The sdnum parameter set to 3 means that the ticsnpeaks function selects only those retention times where values exceed mean TIC + 3 standard deviations.

TICs & Peaks Function:

ticsnpeaks <- function(index_tbl, sdnum = 3) {
    # For every mzML file in `index_tbl`, extract the total ion chromatogram
    # (TIC) and detect "high" peaks: contiguous runs of scans whose TIC value
    # exceeds mean(tic) + sdnum * sd(tic).
    #
    # Args:
    #   index_tbl: tibble with a `path` column of mzML file paths.
    #   sdnum:     number of standard deviations above the mean TIC used as
    #              the peak-detection threshold (default 3).
    #
    # Returns: list(tics  = named list of per-file TIC data frames,
    #               peaks = named list of per-file peak tables with scan
    #                       indices, times, max TIC, base m/z and duration).

    # loadings
        # dependencies
            source("settings.r")
            source("load_pckgs.r")
            dependencies <-  c("readxl", "dplyr", "mzR", "pracma")
            invisible(lapply(dependencies, load_pkg))

    # create empty lists
        tic_list <- list()
        peak_list  <- list()
  
    # loop over each sample in the index table
        for (file in index_tbl$path) {
            cat("Processing file:", basename(file), "\n")
            
            ms <- mzR::openMSfile(file)
            tic <- mzR::tic(ms)
            tic_list[[basename(file)]] <- tic

            threshold <- mean(tic$tic) + sdnum * sd(tic$tic)

            # Identify all indices above threshold
            idx_above <- which(tic$tic > threshold)

            if (length(idx_above) == 0) {
                cat("No peaks detected above the threshold.\n")
            } else {
                # Split idx_above into contiguous runs: a new run starts
                # wherever consecutive indices differ by more than 1.
                # (The previous rle(diff(...))-based version lumped adjacent
                # non-unit gaps into one run, merging a single-scan peak with
                # the following run.)
                gap_after <- which(diff(idx_above) != 1)
                boundaries <- c(0, gap_after, length(idx_above))

                # Loop over each contiguous run to find start, max, and end
                n_runs <- length(boundaries) - 1
                peak_temp_list <- vector("list", n_runs)  # preallocated, indexed from 1

                for (i in seq_len(n_runs)) {
                    these_idx <- idx_above[(boundaries[i] + 1):boundaries[i + 1]]

                    peak_start <- min(these_idx)
                    peak_end   <- max(these_idx)

                    local_max_index <- which.max(tic$tic[these_idx])
                    peak_max <- these_idx[local_max_index]

                    # Extract the spectrum at the scan with max tic
                        sp <- mzR::peaks(ms, scan = peak_max)
                        if (!is.null(sp) && nrow(sp) > 0) {
                            # m/z of the most intense ion in that spectrum
                            base_mz <- sp[which.max(sp[, 2]), 1]
                        } else {
                            base_mz <- NA
                        }

                    # Store results
                    peak_temp_list[[i]] <- dplyr::tibble(
                            start_index  = peak_start,
                            max_index = peak_max,
                            end_index = peak_end,
                            start_time = tic$time[peak_start],
                            max_time = tic$time[peak_max],
                            end_time = tic$time[peak_end],
                            max_peak_tic = tic$tic[peak_max],
                            base_mz = base_mz
                        ) %>%
                        dplyr::mutate(duration = end_time - start_time)
                }

                # Combine
                peak_info <- do.call(rbind, peak_temp_list)
                rownames(peak_info) <- NULL
                peak_list[[basename(file)]] <- peak_info
            }
            close(ms)
        }
    return(list(tics = tic_list, peaks = peak_list))
}
source("ticsnpeaks.r")
e <- ticsnpeaks(spec_index, 3) # fetches TICs; peaks are runs above mean TIC + 3 standard deviations
## Processing file: Y15A.mzML 
## Processing file: Y19A.mzML 
## Processing file: Y20A.mzML 
## Processing file: Y26A.mzML 
## Processing file: Y28A.mzML 
## Processing file: Y29_A.mzML 
## Processing file: Y33_A.mzML 
## Processing file: Y34A.mzML 
## Processing file: Y40A.mzML 
## Processing file: Y49_A.mzML 
## Processing file: Y50A.mzML 
## Processing file: Y51_A.mzML 
## Processing file: Y52_A.mzML 
## Processing file: Y62A.mzML 
## Processing file: Y78_A.mzML 
## Processing file: Y83A.mzML 
## Processing file: Y84A.mzML 
## Processing file: Y85_A.mzML 
## Processing file: Y91_A.mzML 
## Processing file: Y117A.mzML 
## Processing file: Y119A.mzML 
## Processing file: Y134_A.mzML 
## Processing file: Y151A.mzML 
## Processing file: Y152_A.mzML 
## Processing file: Y156_A.mzML 
## Processing file: Y169A.mzML 
## Processing file: Y172A.mzML 
## Processing file: Y179A.mzML 
## Processing file: Y215A.mzML 
## Processing file: Y217_A.mzML 
## Processing file: G4A.mzML 
## Processing file: G5A.mzML 
## Processing file: G9A.mzML 
## Processing file: G10A.mzML 
## Processing file: G19A.mzML 
## Processing file: G20A.mzML 
## Processing file: G21A.mzML 
## Processing file: G22A.mzML 
## Processing file: G27A.mzML 
## Processing file: G28A.mzML 
## Processing file: G30A.mzML 
## Processing file: G34A.mzML 
## Processing file: G35A.mzML 
## Processing file: G39A.mzML 
## Processing file: G41A.mzML 
## Processing file: G42A.mzML 
## Processing file: G47A.mzML 
## Processing file: G48A.mzML 
## Processing file: G49A.mzML 
## Processing file: G55_A.mzML 
## Processing file: G56A.mzML 
## Processing file: G57A.mzML 
## Processing file: G60A.mzML 
## Processing file: G61A.mzML 
## Processing file: G62A.mzML 
## Processing file: G63A.mzML 
## Processing file: G66A.mzML 
## Processing file: G67A.mzML 
## Processing file: G70_A.mzML 
## Processing file: G71_A.mzML 
## Processing file: G73A.mzML 
## Processing file: G75A.mzML 
## Processing file: G80A.mzML 
## Processing file: G81A.mzML 
## Processing file: G87A.mzML 
## Processing file: G88A.mzML 
## Processing file: G90A.mzML 
## Processing file: G91A.mzML 
## Processing file: G93A.mzML 
## Processing file: G94A.mzML 
## Processing file: G95A.mzML 
## Processing file: G96A.mzML 
## Processing file: G97A.mzML 
## Processing file: G98A.mzML 
## Processing file: G99A.mzML 
## Processing file: G104A.mzML 
## Processing file: K6A.mzML 
## Processing file: K7A.mzML 
## Processing file: K8A.mzML 
## Processing file: K9A.mzML 
## Processing file: K10A.mzML 
## Processing file: K11A.mzML 
## Processing file: K12A.mzML 
## Processing file: K13A.mzML 
## Processing file: K14A.mzML 
## Processing file: K15A.mzML 
## Processing file: K16A.mzML 
## Processing file: K17A.mzML 
## Processing file: K18A.mzML 
## Processing file: K19A.mzML 
## Processing file: K20A.mzML 
## Processing file: K21A.mzML 
## Processing file: K22A.mzML 
## Processing file: K23A.mzML 
## Processing file: K24A.mzML 
## Processing file: K25A.mzML 
## Processing file: K26A.mzML 
## Processing file: K27A.mzML 
## Processing file: K28A.mzML 
## Processing file: K29A.mzML 
## Processing file: K30A.mzML 
## Processing file: K31A.mzML 
## Processing file: K32A.mzML 
## Processing file: K33A.mzML 
## Processing file: K34A.mzML
summary(e) # the overall structure of the function's output: one entry per mzML file
##       Length Class  Mode
## tics  105    -none- list
## peaks 105    -none- list
head((e$tics[[1]]), n = 30) # first 30 rows of TIC data (time in seconds, summed ion count) for the first spectrum
##       time   tic
## 1  257.325 11874
## 2  257.426 13827
## 3  257.527 12015
## 4  257.628 13322
## 5  257.729 12393
## 6  257.831 12461
## 7  257.932 10035
## 8  258.033  9754
## 9  258.134 11972
## 10 258.236 10696
## 11 258.337 10185
## 12 258.439 11520
## 13 258.540 10274
## 14 258.641  9594
## 15 258.742 10720
## 16 258.844  9818
## 17 258.945 10534
## 18 259.046 10343
## 19 259.148 11482
## 20 259.249  9802
## 21 259.350 11165
## 22 259.452 10142
## 23 259.553  9610
## 24 259.655 10377
## 25 259.756 11029
## 26 259.857 12006
## 27 259.959 11370
## 28 260.060 12293
## 29 260.161 11131
## 30 260.262  9310
print(e$peaks[[1]]) # peak table for the first spectrum: scan indices, times, max TIC, base m/z, duration
## # A tibble: 3 × 9
##   start_index max_index end_index start_time max_time end_time max_peak_tic
##         <int>     <int>     <int>      <dbl>    <dbl>    <dbl>        <dbl>
## 1        8061      8071      8083      1074.    1075.    1076.      3520526
## 2        9193      9200      9206      1188.    1189.    1190.      1644834
## 3       15351     15386     15413      1812.    1816.    1818.      8040740
## # ℹ 2 more variables: base_mz <dbl>, duration <dbl>
print(e$peaks[[2]]) # same peak table for the second spectrum, for comparison
## # A tibble: 4 × 9
##   start_index max_index end_index start_time max_time end_time max_peak_tic
##         <int>     <int>     <int>      <dbl>    <dbl>    <dbl>        <dbl>
## 1        8059      8075      8087      1074.    1075.    1077.      7768377
## 2        9187      9210      9222      1188.    1191.    1192.     10623814
## 3       11192     11194     11194      1391.    1392.    1392.      1637062
## 4       15343     15378     15398      1812.    1816.    1818.      7868235
## # ℹ 2 more variables: base_mz <dbl>, duration <dbl>

The data for the PCA should not be skewed, so we tested various approaches to de-skewing the data and chose the Box-Cox transformation. The next function performs this transformation as well as the subsequent scaling before proceeding to the PCA.

PCA of TICS Function:

process_tics <- function(
                    data = NULL,
                    index_tbl = NULL,
                    only_high_tic = TRUE,
                    expander = 10 ) {
    
    if (is.null(data) || is.null(index_tbl)) stop("No data")

    # loadings
        # dependencies
            source("settings.r")
            savepath <- ifelse(exists("outpath"), outpath, NULL)
            source("load_pckgs.r")
            source("pc_plotter.r")
            dependencies <-  c("dplyr", "tidyr", "purrr", "IRanges", "e1071", "MASS", "ggplot2", "patchwork", "ggplotify", "ggpubr", "cowplot")
            invisible(lapply(dependencies, load_pkg))
            
            lookup_base_mz <- function(rt) {
                all_peaks %>%
                    dplyr::filter(floor(start_time) <= ceiling(rt), ceiling(end_time) >= floor(rt)) %>%
                    dplyr::pull(base_mz)
            }

            all_peaks <- dplyr::bind_rows(data$peaks)

    # interpolating
        message("\nInterpolating...")

        # Determine the common time range
            common_min <- max(sapply(data$tics, function(df) min(df$time)))
            common_max <- min(sapply(data$tics, function(df) max(df$time)))
            n_points <- min(sapply(data$tics, nrow))
            max_len_dif <- max(sapply(data$tics, nrow)) - n_points

        # Define a common time grid (choose the number of points based on your desired resolution)
            common_time <- seq(common_min, common_max, length.out = n_points)
            cat("\tCommon time length set to ", n_points, "; the range is from ", common_min, "to", common_max, "\n")
            cat("\tMaximum length difference between samples:", max_len_dif, "\n")

        # Resample every TIC using linear interpolation:
            tic_interp <- purrr::map(data$tics, function(df) {
                approx(x = df$time, y = df$tic, xout = common_time)$y
            })
            
            tic_matrix <- do.call(rbind, tic_interp)
            colnames(tic_matrix) <- as.character(round(common_time, 2))
            cat("\tDimensions of interpoled tic matrix:", dim(tic_matrix), "\n")

    # subset to only high tics
        if (only_high_tic) {
            message("\nSubsetting to high tics...")
            hi_tic_ranges <- all_peaks %>%
                dplyr::select(start_index, end_index) %>%
                { IRanges::IRanges(start = .$start_index, end = .$end_index) } %>%
                IRanges::reduce() %>%
                as.data.frame() %>%
                dplyr::as_tibble() %>%
                dplyr::arrange(start) %>%
                dplyr::mutate( # expand the range just for case
                    start = start - expander,
                    end = end + expander,
                    width = end - start
                )

            selected_scans <- unlist(lapply(seq_len(nrow(hi_tic_ranges)), function(i) {
                hi_tic_ranges$start[i]:hi_tic_ranges$end[i]
            }))

            cat("\tNumber of time ranges:", nrow(hi_tic_ranges), "; total range:", sum(hi_tic_ranges$width), "\n")
            tic_matrix <- tic_matrix[, selected_scans, drop = FALSE]
            cat("\tDimensions of subsampled tic matrix:", dim(tic_matrix), "\n")
        }

    # save the originbal colnames (ret time indices)
        colnames <- colnames(tic_matrix)

    # Box-Cox transformation and scaling
        message("\nBox-Coxing and scaling...")
        tic_matrix_boxcox <- apply(tic_matrix, 2, function(x) {
            bc <- boxcox(x ~ 1, plotit = FALSE)
            lambda <- bc$x[which.max(bc$y)]
            
            if (abs(lambda) < 1e-6) {
                # When lambda is very close to 0, use logarithm
                return(log(x))
            } else {
                return((x^lambda - 1) / lambda)
            }
        })

        tic_matrix <- matrix(tic_matrix_boxcox, nrow = nrow(tic_matrix), ncol = ncol(tic_matrix))
        
        tic_matrix <- scale(tic_matrix)
        
        colnames(tic_matrix) <- colnames # restore the colnames

    # Check the skewness
        skew_values <- apply(tic_matrix, 2, e1071::skewness)
        cat("\tSkewness:\n")
        print(summary(skew_values))

        if (skew_values[4] > 1) stop("Skewness is larger than 1...")

    # PCA
        # analysis
            message("\nPrincipal Component Analysis...")
            pca_result <- prcomp(tic_matrix, center = FALSE, scale. = FALSE)
            pca_summary <- summary(pca_result)
            cat("\tSummary of the first five principal components:\n")
            print(pca_summary$importance[, 1:5])
            #plot( # The variance explained by PCs
            #    pca_result,
            #    type = "l",
            #    main = "Variance, explained by PCAs"
            #    )

        # visualization
            # Prepare the PCA data for visualisation
                message("\nCooking the plot...")
                pca_df <- as.data.frame(pca_result$x)
                pca_df$file <- names(data$tics)
                pca_plot <- index_tbl %>% 
                    dplyr::mutate(file = basename(path)) %>% 
                    dplyr::select(file, period, site) %>%
                    dplyr::inner_join(pca_df, by = "file")

            # PC1 vs PC2 scores
                p1_title <- ifelse(
                    only_high_tic,
                    "TIC principal component analysis, subseted for high tics",
                    "TIC principal component analysis"
                    )
                
                p1 <- ggplot2::ggplot(
                        pca_plot,
                        ggplot2::aes(x = PC1, y = PC2, color = period, shape = site)
                    ) +
                    ggplot2::geom_point(size = 3) +
                    ggplot2::theme_minimal() +
                    ggplot2::labs(
                        title = p1_title,
                        x = "PC1",
                        y = "PC2"
                        )

            # first PCs loadings
                t <- pc_plotter(pca_result, pc = 1)
                p2 <- t$plot
                top_loadings <- t$sel_loadings
                
                t <- pc_plotter(pca_result, pc = 2)
                p3 <- t$plot
                top_loadings <- c(top_loadings, t$sel_loadings)

            # create a table of ret times and baze m/z-s which are responsible for the observed clustering
                noteworthy <- dplyr::tibble(
                        ret_time = sort(top_loadings),
                        base_mz = purrr::map_chr(
                            sort(top_loadings), ~ {
                                matching <- lookup_base_mz(.x)
                                if(length(matching) == 0) { NA } else { paste(unique(matching), collapse = ", ") }
                            })
                        )

            # split the table
                mid <- ceiling(nrow(noteworthy) / 2)
                ntwrth_left  <- noteworthy[1:mid, ]
                ntwrth_right <- noteworthy[(mid + 1):nrow(noteworthy), ]

            # Create table grobs
                p4_left <- ggpubr::ggtexttable(ntwrth_left, rows = NULL, theme = ggpubr::ttheme("minimal")) +
                    ggplot2::theme(plot.margin = ggplot2::margin(5, 5, 5, 5))
                p4_right <- ggpubr::ggtexttable(ntwrth_right, rows = NULL, theme = ggpubr::ttheme("minimal")) +
                    ggplot2::theme(plot.margin = ggplot2::margin(5, 5, 5, 5))

                p4_table <- cowplot::plot_grid(p4_left, p4_right, ncol = 2, align = "hv")

                p4_title <- cowplot::ggdraw() +
                    cowplot::draw_label("Noteworthy peaks", fontface = 'bold', size = 16, x = 0.5, hjust = 0.5)

                p4 <- cowplot::plot_grid(p4_title, p4_table, ncol = 1, rel_heights = c(0.12, 1))

            # Combine the plots
                if (only_high_tic) {
                    combined_plot <- p3 + p1 + p4 + p2 + plot_layout(ncol = 2, nrow = 2)
                } else {
                    combined_plot <- p1
                }

        results <- list(plot = combined_plot)
    return(results)
}
# Run the TIC PCA workflow on the full (unsubsetted) TIC matrix and save the plot.
# Use TRUE/FALSE, not T/F: T and F are ordinary variables that can be reassigned,
# so relying on them is a well-known R footgun.
source("process_tics.r")
ticz <- process_tics(data = e, index_tbl = spec_index, only_high_tic = FALSE)
##  Common time length set to  17856 ; the range is from  257.341 to 2069.707 
##  Maximum length difference between samples: 41 
##  Dimensions of interpoled tic matrix: 105 17856
##  Skewness:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.46362  0.04883  0.10339  0.10481  0.15585  0.59073
##  Summary of the first five principal components:
##                              PC1      PC2      PC3      PC4      PC5
## Standard deviation     110.48748 38.79363 33.33107 22.56524 19.37091
## Proportion of Variance   0.68366  0.08428  0.06222  0.02852  0.02101
## Cumulative Proportion    0.68366  0.76795  0.83016  0.85868  0.87969
plot_saver(ticz$plot, outpath, "tics_all.png", 10, 10, 100)

Plot image

No prominent clustering: with more variables than samples, PCA can overfit to noise, leading to misleading principal components. Our next step is therefore to shrink the TIC data to only the high-intensity TIC peaks, which have already been calculated by the ticsnpeaks function above (the hot areas are accumulated across all the samples). To do that, we call the process_tics function again with the only_high_tic parameter set to TRUE.

# Re-run the TIC PCA workflow restricted to the high-intensity TIC ranges and
# save the plot. Use TRUE/FALSE, not T/F: T and F are ordinary variables that
# can be reassigned, so relying on them is a well-known R footgun.
source("process_tics.r")
ticz <- process_tics(data = e, index_tbl = spec_index, only_high_tic = TRUE)
##  Common time length set to  17856 ; the range is from  257.341 to 2069.707 
##  Maximum length difference between samples: 41 
##  Dimensions of interpoled tic matrix: 105 17856
##  Number of time ranges: 57 ; total range: 3480 
##  Dimensions of subsampled tic matrix: 105 3537
##  Skewness:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.37168  0.06673  0.13890  0.14272  0.21547  0.59073
##  Summary of the first five principal components:
##                             PC1      PC2      PC3      PC4      PC5
## Standard deviation     48.39170 17.08743 14.22424 10.53953 9.445137
## Proportion of Variance  0.66207  0.08255  0.05720  0.03141 0.025220
## Cumulative Proportion   0.66207  0.74462  0.80183  0.83323 0.858460
plot_saver(ticz$plot, outpath, "tics_sel.png", 10, 10, 100)

Plot image

Now we can see some clustering in the space of the first two principal components, where the period (Shang & Zhou) might also have an impact. The first five PCs (out of 105) explain most of the variance, and the first two of them have the most pronounced power. Each principal component is a linear combination of all original variables (total ion intensities per retention time), where each variable is multiplied by a specific coefficient known as a loading. The variables with larger absolute loadings in PC1 and PC2 have a stronger impact on the visual clustering observed in the plot of PC1 vs PC2. We have selected those variables whose loadings were at least 0.95 of the maximum loading in PC1 and PC2 and listed their retention times as noteworthy peaks; the base_mz is also presented for those peaks. The base_mz is the m/z value of the most intense peak at a given retention time. It might be interesting to understand whether the discovered noteworthy peaks have some meaning for the residue analysis.